# R Options
options(stringsAsFactors=FALSE, "citation_format"="pandoc")
# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(reticulate) # required for 'leiden' clustering
library(enrichR) # functional enrichment
library(future) # multicore support for Seurat
plan("multiprocess") # by default, this uses all available cores; use "workers=4" for example, to use 4 cores
options(future.globals.maxSize = 800000000) # increase if the maximum allowed size in bytes of objects exported by future is exceeded
# Other libraries we use
# Knit: knitr
# Data handling: dplyr, tidyr, purrr, stringr, Matrix
# Tables: kableExtra
# Plots: ggsci, ggpubr
# IO: openxlsx, readr, R.utils
# Annotation: biomaRt
# DEG: mast
# Functional enrichment: enrichR
# Other: sessioninfo
# Knitr default options
knitr::opts_chunk$set(echo=TRUE, cache=FALSE, message=FALSE, warning=FALSE, tidy=FALSE, fig.width=10)
# Python needed for clustering, umap, other python packages
# Change location if necessary
Sys.setenv(RETICULATE_PYTHON = "/usr/bin/python")
# biomaRt mirror: NULL means the first working mirror is used
mirror = NULLThis code chunk contains all parameters that are set specifically for the project.
param = list()
# Project ID
param$project = "pbmc"
# Input data path in case Cell Ranger was run
param$path_data = data.frame(name=c("pbmc_10x","pbmc_smartseq2"),
type=c("10x","smartseq2"),
path=c("test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/10x/","test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/smartseq2/counts_table.tsv.gz"))
param$file_mapping_stats = NULL
# Project-specific paths
param$path_out = "test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/"
if (!file.exists(param$path_out)) dir.create(param$path_out, recursive=TRUE, showWarnings=FALSE)
# Marker genes based on literature, translated to Ensembl IDs
# xlsx file, one list per column, first row as header and Ensembl IDs below
# Set to NULL if no known marker genes should be plotted
param$file_known_markers = "test_datasets/10x_pbmc_1k_healthyDonor_v3Chemistry/known_markers.xlsx"
# Annotation via biomaRt
param$mart_dataset = "hsapiens_gene_ensembl"
param$annot_version = 98
param$annot_main = c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="external_gene_name")
param$file_annot = NULL
if (is.null(param$file_annot)) {
param$file_annot = file.path(param$path_out, paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt"))
}
param$mart_attributes = c(param$annot_main,
c("chromosome_name", "start_position", "end_position",
"percentage_gene_gc_content", "gene_biotype", "strand", "description"))
# Prefix of mitochondrial genes
param$mt = "^MT-"
# Filters
param$cell_filter = list(nFeature_RNA=c(200,NA), percent_mt=c(NA,20))
param$feature_filter = list(min_counts=1, min_cells=3) # feature has to be found by at least one count in one cell
param$samples_to_drop = c("NC", "RNA") # cells from these samples will be dropped after initial QC
# Whether or not to remove cell cycle effects
param$cc_remove = FALSE
# Should all cell cycle effects be removed, or only the difference between profilerating cells (G2M and S phase)?
# Read https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html, for an explanation
param$cc_remove_all = FALSE
# Additional (unwanted) variables that will be regressed out for visualisation and clustering
param$vars_to_regress = c()
# Additional (unwanted) variables to be included in the statistical tests
param$latent_vars = NULL
# When there are multiple datasets, how to integrate them:
# - method:
# - merge: Just merge them since no integration is needed (e.g. samples were multiplexed on the same chip)
# - standard: Anchors are computed for all pairs of datasets. This will give all datasets the same weight during dataset integration but can be computationally intensive.
# - reference: One dataset is used as reference and anchors are computed for all other datasets. Less accurate but computationally faster.
# - reciprocal: Anchors are computed in PCA space instead of the data. Even less accurate but for very big datasets.
#
# - reference_dataset: When using method "reference", which dataset is the reference? Can be numeric or name of the dataset.
# - dimensions: Number of dimensions to consider for integration
param$integrate_samples = list(method="standard", reference_dataset=1, dimensions=30)
# Which normalisation should be used for analysis: RNA or SCT?
param$normalisation_default = "SCT"
# The number of PCs to use; adjust this parameter based on the Elbowplot
param$pc_n = 10
# Resolution of clusters; low values will lead to fewer clusters of cells
param$cluster_resolution=0.5
# Thresholds to define differentially expressed genes
param$padj = 0.05
param$log2fc = log2(1.5)
# Enrichr databases of interest
param$enrichr_dbs = c("GO_Molecular_Function_2018", "GO_Biological_Process_2018", "GO_Cellular_Component_2018")
# Main colour(s) to use for plots
param$col = "palevioletred"
# Colour palette and colours used for samples
param$col_palette_samples = ggsci::pal_jama
# Colour palette and colours used for cluster
param$col_palette_cluster = ggsci::pal_startrek
# Sample data to at most n cells per dataset/sample (mainly for tests); set to NULL to deactivate
param$sample_cells = NULL
# Path to git repository
param$path_to_git = "."We begin by printing mapping statistics that have been produced prior to this workflow.
# Are statistics provided?
if (!all(is.na(param$path_data$stats))) {
# Loop through all samples and read mapping stats
mapping_stats_list = list()
for (i in 1:nrow(param$path_data)) {
if (!is.na(param$path_data$stats[i])) {
mapping_stats_list[[param$path_data$name[i]]] = read.delim(param$path_data$stats[i],
sep=",", header=FALSE, check.names=FALSE) %>%
t() %>% as.data.frame()
}
}
# Join all mapping stats tables
mapping_stats = mapping_stats_list %>% purrr::reduce(dplyr::full_join, by="V1")
rownames(mapping_stats) = mapping_stats[["V1"]]
mapping_stats = mapping_stats %>% dplyr::select(-V1)
colnames(mapping_stats) = names(mapping_stats_list)
# Print table to HTML
knitr::kable(mapping_stats, align="l", caption="Mapping statistics") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
} else {
message("Mapping statistics cannot be shown. No valid file provided.")
}## Mapping statistics cannot be shown. No valid file provided.
If not provided already, we read gene annotation from Ensembl and write the resulting table to file. We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat rownames.
# Read annotation from csv or from Ensembl and a tab separated txt will be created
if (file.exists(param$file_annot)) {
annot_ensembl = read.delim(param$file_annot)
} else {
annot_mart = biomaRt::useEnsembl("ensembl", dataset=param$mart_dataset, mirror=mirror, version=param$annot_version)
annot_ensembl = biomaRt::getBM(mart=annot_mart, attributes=param$mart_attributes)
write.table(annot_ensembl, file=param$file_annot, sep='\t', col.names=TRUE, row.names=FALSE, append=FALSE)
message("Gene annotation file was created at: ", param$file_annot)
# Note: depending on the attributes, there might be more than one row per gene
}
# Double-check if we got all required annotation, in case annotation file was read
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (!check_annot_main) {
stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}
# Create translation tables
ensembl = param$annot_main["ensembl"]
symbol = param$annot_main["symbol"]
entrez = param$annot_main["entrez"]
# Ensembl id to gene symbol
ensembl_to_symbol = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_symbol = setNames(ensembl_to_symbol[, symbol], ensembl_to_symbol[, ensembl])
# Ensembl id to seurat-compatible unique rowname
ensembl_to_seurat_rowname = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_seurat_rowname[, symbol] = make.unique(gsub(pattern="_", replacement="-", x=ensembl_to_seurat_rowname[, symbol], fixed=TRUE))
ensembl_to_seurat_rowname = setNames(ensembl_to_seurat_rowname[, symbol], ensembl_to_seurat_rowname[, ensembl])
# Seurat-compatible unique rowname to ensembl id
seurat_rowname_to_ensembl = setNames(names(ensembl_to_seurat_rowname), ensembl_to_seurat_rowname)
# Gene symbol to ensembl id: named LIST to account for genes where one symbol translates to multiple Ensembl IDs
symbol_to_ensembl_df = unique(annot_ensembl[, c(ensembl, symbol)])
symbol_to_ensembl = split(symbol_to_ensembl_df[, ensembl], symbol_to_ensembl_df[, symbol])
# Gene symbol to (seurat compatible unique) gene symbol: named LIST to account for genes with multiple names
symbol_to_seurat_rowname = unique(annot_ensembl[, c(ensembl, symbol)])
symbol_to_seurat_rowname$seurat_rowname = ensembl_to_seurat_rowname[symbol_to_seurat_rowname[, ensembl]]
symbol_to_seurat_rowname = split(symbol_to_seurat_rowname$seurat_rowname, symbol_to_seurat_rowname[, symbol])
# Ensembl to Entrez
ensembl_to_entrez = unique(annot_ensembl[, c(ensembl, entrez)])
ensembl_to_entrez[, entrez] = ifelse(nchar(ensembl_to_entrez[, entrez]) == 0, NA, ensembl_to_entrez[, entrez])
ensembl_to_entrez = split(ensembl_to_entrez[, entrez], ensembl_to_entrez[, ensembl])
# Seurat-compatible unique rowname to Entrez
seurat_rowname_to_ensembl_match = match(seurat_rowname_to_ensembl, names(ensembl_to_entrez))
names(seurat_rowname_to_ensembl_match) = names(seurat_rowname_to_ensembl)
seurat_rowname_to_entrez = purrr::map(seurat_rowname_to_ensembl_match, function(i) {unname(ensembl_to_entrez[[i]])})
# Entrez IDs is duplicating Ensembl IDs in annot_ensembl
# Therefore, we remove Entrez IDs from the annotation table, after generating all required translation tables
# Set rownames of annotation table to Ensembl identifiers
annot_ensembl = annot_ensembl[, -match(entrez, colnames(annot_ensembl))] %>% unique() %>% as.data.frame()
rownames(annot_ensembl) = annot_ensembl[, ensembl]# Use biomart to translate human cell cycle genes to the species of interest
# Note: both mart objects must point to the same mirror for biomarT::getLDS to work
mart_human = GetBiomaRt(biomart="ensembl", dataset="hsapiens_gene_ensembl", mirror=mirror)
mart_myspecies = GetBiomaRt(biomart="ensembl", dataset=param$mart_dataset, mirror=GetBiomaRtMirror(mart_human))
genes_s = biomaRt::getLDS(attributes=c("external_gene_name"),
filters="external_gene_name",
values=cc.genes.updated.2019$s.genes,
mart=mart_human,
attributesL=c("external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)
genes_g2m = biomaRt::getLDS(attributes=c("external_gene_name"),
filters="external_gene_name",
values=cc.genes.updated.2019$g2m.genes,
mart=mart_human,
attributesL=c("external_gene_name"),
martL=mart_myspecies,
uniqueRows=TRUE)We next read the scRNA-seq dataset(s) into Seurat.
# List of Seurat objects
sc = list()
datasets = param$path_data
for (i in seq(nrow(datasets))) {
name = datasets[i,"name"]
type = datasets[i,"type"]
path = datasets[i,"path"]
# Read 10X or smartseq2
if (type == "10x") {
# Read 10X sparse matrix into a Seurat object
sc[[name]] = ReadSparseMatrix(path,
project=name,
row_name_column=1,
convert_row_names=ensembl_to_seurat_rowname)
} else if (type == "smartseq2") {
# Read counts table into a Seurat object
sc = c(sc, ReadCountsTable(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, parse_plate_information=TRUE, return_samples_as_datasets=TRUE))
}
}
# Check that sample names are unique between datasets: this can happen if multiple Smartseq2 datasets have a negative control (NC)
# In this case, just use the dataset name which is unique, e.g. set1.NC and set2.NC
sample_names = purrr::map(sc, function(s){ unique(as.character(s[[]][["orig.ident"]])) })
duplicate_sample_names = purrr::reduce(sample_names, intersect)
sc = purrr::map(list_names(sc), function(n) {
if (any(sc[[n]][["orig.ident", drop=TRUE]] %in% duplicate_sample_names)) {
sc[[n]][["orig.ident.old"]] = sc[[n]][["orig.ident"]]
sc[[n]][["orig.ident"]] = n
sc[[n]] = Seurat::SetIdent(sc[[n]], value="orig.ident")
}
return(sc[[n]])
})
# Enforce unique barcodes per cell after integration
if (length(sc) > 1) {
for (i in 1:length(sc)) {
# Remove "-1" if necessary
sc[[i]] = Seurat::RenameCells(sc[[i]], new.names=gsub("-1", "", Cells(sc[[i]])))
# Add "-1", "-2", and so on
sc[[i]] = Seurat::RenameCells(sc[[i]], new.names=paste0(Cells(sc[[i]]), "-", i))
}
}
# Set up colors for samples
sample_names = purrr::flatten_chr(purrr::map(sc, function(s){ unique(as.character(s[[]][["orig.ident"]])) }))
param$col_samples = GenerateColours(num_colours=length(sample_names), palette=param$col_palette_samples)
names(param$col_samples) = sample_names
# Sample cells if requested
if (!is.null(param$sample_cells)) {
sc = purrr::map(sc, function(s) {
cells = colnames(s)
return(subset(s, cells=sample(cells, min(param$sample_cells, length(cells)))))
})
}We start the analysis by removing unwanted cells from the dataset(s). Three commonly used QC metrics include the number of unique genes detected in each cell (“nFeature_RNA”), the total number of molecules detected in each cell (“nCount_RNA”), and the percentage of counts that map to the mitochrondrial genome (“percent_mt”). If ERCC spike-in controls were used, the percentage of counts mapping to them is also shown (“percent_ercc”).
The following first table shows metadata (columns) of the first 5 cells (rows). These metadata provide additional information about the cells in the dataset, such as the sample a cell belongs to (“orig.ident”), or the above mentioned number of unique genes detected (“nFeature”). The second table shows metadata (columns) of the first 5 genes (rows), for example the number of cells with at least 1 count for the gene (“num_cells_expr”), and the number of cells with at least as many counts as set in the parameter filter section (“num_cells_expr_threshold”).
# If filters were specified globally (i.e. not by sample), this chunk will copy them for each sample such that downstream filtering can work by sample
param$cell_filter = purrr::map(list_names(sc), function(s) {
if (s %in% names(param$cell_filter)) {
return(param$cell_filter[[s]])
} else {
return(param$cell_filter)
}
})
param$feature_filter = purrr::map(list_names(sc), function(s) {
if (s %in% names(param$feature_filter)) {
return(param$feature_filter[[s]])
} else {
return(param$feature_filter)
}
})# Calculate percentage of counts in mitochondrial genes for each Seurat object
sc = purrr::map(sc, Seurat::PercentageFeatureSet, pattern=param$mt, col.name="percent_mt", assay="RNA")
# Calculate percentage of counts in ERCC for each Seurat object (if assay is available)
sc = purrr::map(sc, function(s) {
if ("ERCC" %in% Seurat::Assays(s)) s$percent_ercc = s$nCount_ERCC/(s$nCount_ERCC + s$nCount_RNA)*100
return(s)
})
# Combine cell metadata of the Seurat objects into one big metadata
sc_cell_metadata = purrr::map_dfr(sc, function(s){ s[[]] }) %>% as.data.frame()
sc_cell_metadata$orig.ident = factor(sc_cell_metadata$orig.ident,levels=unique(sc_cell_metadata$orig.ident))
# Print table
knitr::kable(head(sc[[1]][[]], 5), align="l", caption="Cell metadata, top 5 rows") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| orig.ident | nCount_RNA | nFeature_RNA | percent_mt | |
|---|---|---|---|---|
| PBMC1_10x_AAACCCACACTTGGGC-1 | pbmc_10x | 7552 | 2037 | 8.659958 |
| PBMC1_10x_AAACCCACAGGTGGAT-1 | pbmc_10x | 4773 | 1800 | 8.694741 |
| PBMC1_10x_AAACCCAGTGCTTATG-1 | pbmc_10x | 4430 | 1565 | 14.762980 |
| PBMC1_10x_AAACCCATCCGTAGTA-1 | pbmc_10x | 4512 | 1592 | 8.621454 |
| PBMC1_10x_AAACCCATCTTACACT-1 | pbmc_10x | 6663 | 1919 | 8.314573 |
# Only RNA assay at the moment
sc = purrr::map(list_names(sc), function(n) {
num_cells_expr = Matrix::rowSums(Seurat::GetAssayData(sc[[n]], slot="counts", assay="RNA") >= 1)
num_cells_expr_threshold = Matrix::rowSums(Seurat::GetAssayData(sc[[n]], slot="counts", assay="RNA") >= param$feature_filter[[n]][["min_counts"]])
sc[[n]][["RNA"]] = Seurat::AddMetaData(sc[[n]][["RNA"]], data.frame(num_cells_expr,num_cells_expr_threshold))
return(sc[[n]])
})
# Print table
knitr::kable(head(sc[[1]][["RNA"]][[]], 5), align="l", caption="Feature metadata, top 5 rows (only first dataset shown)") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| feature_id | feature_name | feature_type | num_cells_expr | num_cells_expr_threshold | |
|---|---|---|---|---|---|
| TSPAN6 | ENSG00000000003 | TSPAN6 | Gene Expression | 151 | 151 |
| TNMD | ENSG00000000005 | TNMD | Gene Expression | 0 | 0 |
| DPM1 | ENSG00000000419 | DPM1 | Gene Expression | 629 | 629 |
| SCYL3 | ENSG00000000457 | SCYL3 | Gene Expression | 166 | 166 |
| C1orf112 | ENSG00000000460 | C1orf112 | Gene Expression | 45 | 45 |
# Plot QC metrics for cells
cell_qc_features = c("nFeature_RNA", "nCount_RNA", "percent_mt")
if ("percent_ercc" %in% colnames(sc_cell_metadata)) cell_qc_features = c(cell_qc_features,"percent_ercc")
cell_qc_features = values_to_names(cell_qc_features)
p_list = list()
for (i in names(cell_qc_features)) {
p_list[[i]]= ggplot(sc_cell_metadata,aes_string(x="orig.ident", y=i, fill="orig.ident")) +
geom_violin(scale="width")
p_list[[i]] = PlotMystyle(p_list[[i]], title=i, legend_position="none", fill=param$col_samples) + xlab("")
# Creates a table with min/max values for filter i for each dataset
#
# Iterate over datasets
cell_filter_for_plot = purrr::map_dfr(names(param$cell_filter), function(n) {
# If filter i in cell filter of the dataset, then create dataframe with columns orig.ident, threshold and value
if (i %in% names(param$cell_filter[[n]])){
orig_ident = head(sc[[n]][["orig.ident"]],1)
data.frame(orig.ident=orig_ident, threshold=c("min","max"), value=param$cell_filter[[n]][[i]])
}
})
# Add filters as segments to plot
if (nrow(cell_filter_for_plot)>0) {
# Remove entries that are NA
cell_filter_for_plot = cell_filter_for_plot %>% dplyr::filter(!is.na(value))
p_list[[i]] = p_list[[i]] + geom_segment(data=cell_filter_for_plot,aes(x=as.integer(as.factor(orig.ident))-0.5, xend=as.integer(as.factor(orig.ident))+0.5, y=value, yend=value),lty=2, col="firebrick")
}
}
p = patchwork::wrap_plots(p_list, ncol = 3) + patchwork::plot_annotation("Distribution of feature values")
p# Correlate QC metrics for cells
p = list()
p[[1]] = ggplot(sc_cell_metadata,aes_string(x=cell_qc_features[2], y=cell_qc_features[1], colour="orig.ident")) +
geom_point()
p[[2]] = ggplot(sc_cell_metadata,aes_string(x=cell_qc_features[3], y=cell_qc_features[1], colour="orig.ident")) +
geom_point()
for (i in 1:length(p)) p[[i]] = PlotMystyle(p[[i]], col=param$col_samples)
p = patchwork::wrap_plots(p, ncol = 2) + patchwork::plot_annotation("Features plotted against each other")
if (length(sc)==1) {
p = p & theme(legend.position="bottom")
} else {
p = p + patchwork::plot_layout(guides = "collect") & theme(legend.position="bottom")
}
pThe size of the Seurat object before filtering is:
## $pbmc_10x
## An object of class Seurat
## 33694 features across 4033 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
##
## $pbmc_smartseq2.pbmc_smartseq2.sample1
## An object of class Seurat
## 33694 features across 311 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
Cells and genes are filtered based on the following thresholds:
cell_filter_list = param$cell_filter %>% unlist(recursive=FALSE)
cell_filter_tbl = cell_filter_list %>% purrr::reduce(rbind) %>% as.data.frame()
rownames(cell_filter_tbl) = names(cell_filter_list)
colnames(cell_filter_tbl) = c("Min", "Max")
feature_filter_list = param$feature_filter %>% unlist(recursive=FALSE)
feature_filter_tbl = feature_filter_list %>% purrr::reduce(rbind) %>% as.data.frame()
rownames(feature_filter_tbl) = names(feature_filter_list)
colnames(feature_filter_tbl) = "n"
knitr::kable(cell_filter_tbl, align="l", caption="Filters applied to cells") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| Min | Max | |
|---|---|---|
| pbmc_10x.nFeature_RNA | 200 | NA |
| pbmc_10x.percent_mt | NA | 20 |
| pbmc_smartseq2.pbmc_smartseq2.sample1.nFeature_RNA | 200 | NA |
| pbmc_smartseq2.pbmc_smartseq2.sample1.percent_mt | NA | 20 |
knitr::kable(feature_filter_tbl, align="l", caption="Filters applied to genes") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| n | |
|---|---|
| pbmc_10x.min_counts | 1 |
| pbmc_10x.min_cells | 3 |
| pbmc_smartseq2.pbmc_smartseq2.sample1.min_counts | 1 |
| pbmc_smartseq2.pbmc_smartseq2.sample1.min_cells | 3 |
The number of excluded cells and features is as follows:
# Iterate over datasets and filters
# Record a cell if it does not pass the filter
# Also record a cell if it belongs to a sample that should be dropped
sc_cells_to_exclude = purrr::map(list_names(sc), function(n) {
filter_result = purrr::map(list_names(param$cell_filter[[n]]), function(f) {
filter = param$cell_filter[[n]][[f]]
if (is.numeric(filter)) {
if (is.na(filter[1])) filter[1] = -Inf # Minimum
if (is.na(filter[2])) filter[2] = Inf # Maximum
idx_exclude = sc[[n]][[f, drop=TRUE]] < filter[1] | sc[[n]][[f, drop=TRUE]] > filter[2]
return(names(which(idx_exclude)))
} else if (is.character(filter)) {
return(names(which(sc[[n]][[f, drop=TRUE]] %in% filter)))
}
})
# Samples to drop
cells_orig_sample = sc[[n]][["orig.ident", drop=TRUE]]
filter_result[["sample_to_drop"]] = names(cells_orig_sample[cells_orig_sample %in% param$samples_to_drop])
return(filter_result)
})
# Summarise
sc_cells_to_exclude_summary = purrr::map_dfr(sc_cells_to_exclude, function(s) {
return(as.data.frame(purrr::map(s,length)))
})
rownames(sc_cells_to_exclude_summary) = names(sc_cells_to_exclude)
knitr::kable(sc_cells_to_exclude_summary, align="l", caption="Number of excluded cells") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) | nFeature_RNA | percent_mt | sample_to_drop | |
|---|---|---|---|
| pbmc_10x | 286 | 271 | 0 |
| pbmc_smartseq2.pbmc_smartseq2.sample1 | 1 | 0 | 0 |
# Now filter, drop the respective colours and adjust integration method
sc = purrr::map(list_names(sc), function(n) {
cells_to_keep = Cells(sc[[n]])
cells_to_keep = cells_to_keep[!cells_to_keep %in% purrr::flatten_chr(sc_cells_to_exclude[[n]])]
if(length(cells_to_keep)==0) return(NULL)
else return(subset(sc[[n]], cells=cells_to_keep))
}) %>% purrr::discard(is.null)
if(length(sc)==1) param$integrate_samples[["method"]] = "single"# Only RNA assay at the moment
# Iterate over datasets and record a feature if it does not pass the filter
sc_features_to_exclude = purrr::map(list_names(sc), function(n) {
names(which(sc[[n]][["RNA"]][["num_cells_expr_threshold", drop=TRUE]] < param$feature_filter[[n]][["min_cells"]]))
})
# Summarise
sc_features_to_exclude_summary = purrr::map(sc_features_to_exclude, length) %>%
t() %>% as.data.frame()
rownames(sc_features_to_exclude_summary) = c("Genes")
knitr::kable(sc_features_to_exclude_summary, align="l", caption="Number of excluded genes") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| pbmc_10x | pbmc_smartseq2.pbmc_smartseq2.sample1 | |
|---|---|---|
| Genes | 13893 | 15754 |
# Now filter
sc = purrr::map(list_names(sc), function(n) {
features_to_keep = purrr::flatten_chr(purrr::map(Seurat::Assays(sc[[n]]), function(a) rownames(sc[[n]][[a]])))
features_to_keep = features_to_keep[!features_to_keep %in% sc_features_to_exclude[[n]]]
return(subset(sc[[n]], features=features_to_keep))
})After filtering, the size of the Seurat object is:
## $pbmc_10x
## An object of class Seurat
## 19801 features across 3608 samples within 1 assay
## Active assay: RNA (19801 features, 0 variable features)
##
## $pbmc_smartseq2.pbmc_smartseq2.sample1
## An object of class Seurat
## 17940 features across 310 samples within 1 assay
## Active assay: RNA (17940 features, 0 variable features)
In this section, we subsequently run a series of Seurat functions:
1. We start by running a standard log normalisation, where counts for each cell are divided by the total counts for that cell and multiplied by 10,000. This is then natural-log transformed.
2. Variable genes are selected. For downstream analysis it is beneficial to focus on genes that exhibit high cell-to-cell variation, that is they are highly expressed in some cells and lowly expressed in others.
3. To be able to compare normalised gene counts between genes, gene counts are further scaled to have zero mean and unit variance (z-score). This way, genes are equally weighted for downstream analysis.
In addition, we run:
4. SCTransform, a new and more sophisticated normalisation method that replaces the previous functions (normalisation, variable genes and scaling). Note that although results are kept for steps 1-3 (“RNA” assay), downstream analyses will default to resulting data of step 4 (“SCT” assay).
Finally, cell cycle scores are assigned to each cell based on its normalised expression of G2/M and S phase markers, after basic normalisation (step 1). These scores are visualised in a separate section further below. If specified in the above parameter section, cell cycle effects are removed during scaling (step 4). Note that removing all signal associated to cell cycle can negatively impact downstream analysis. For example, in differentiating processes, stem cells are quiescent and differentiated cells are proliferating (or vice versa), and removing all cell cycle effects can blur the distinction between these cells. As an alternative, we can remove the difference between G2M and S phase scores. This way, signals separating non-cycling and cycling cells will be maintained, while differences amongst proliferating cells will be removed. For a more detailed explanation, see the cell cycle vignette for Seurat (Unknown 2020a). Cell cycle effect removed for this report: FALSE; all cell cycle effects removed for this report: FALSE.
While raw data is typically used for statistical tests such as finding marker genes, normalised data is mainly used for visualising gene expression values. Scaled data include variable genes only, potentially without cell cycle effects, and are mainly used to determine the structure of the dataset(s) with Principal Component Analysis, and indirectly to cluster and visualise cells in 2D space.
# This function often crashes with multicore support, so stop for a moment
future::plan("sequential")
# Normalise data the original way
sc = purrr::map(sc, Seurat::NormalizeData, normalization.method = "LogNormalize", scale.factor=10000, verbose=FALSE)
# Back to multicore support
future::plan("multicore")
# Probably obsolete, not needed anywhere in this report
# Select features from normalised data (unaffected by scaling)
sc = purrr::map(sc, Seurat::FindVariableFeatures, selection.method = "vst", nfeatures = 3000, verbose=FALSE)# Determine cell cycle effect
sc = purrr::map(sc, Seurat::CellCycleScoring, s.features=genes_s[,2], g2m.features=genes_g2m[,2], set.ident=FALSE)
# Determine difference between G2M and S phase scores
sc = purrr::map(sc, function(s) {
s$CC.Difference = s$S.Score - s$G2M.Score
return(s)
})
# If cell cycle effects should be removed, we first score cells
# The effect is then removed in the following chunk
if(param$cc_remove) {
# Add to vars that need to regressed out during normalisation
if (param$cc_remove_all) {
# Regress out cell cycle effect
param$vars_to_regress = unique(c(param$vars_to_regress,"S.Score", "G2M.Score"))
param$latent_vars = unique(c(param$latent_vars,"S.Score", "G2M.Score"))
} else {
# Remove all signal associated to cell cycle
param$vars_to_regress = unique(c(param$vars_to_regress,"CC.Difference"))
param$latent_vars = unique(c(param$latent_vars,"CC.Difference"))
}
}# Scale RNA assay
# Note: Removing cell cycle effects in "RNA" scaled data can be very slow
sc = purrr::map(sc, function(s) {
Seurat::ScaleData(s, features=VariableFeatures(s), vars.to.regress=param$vars_to_regress, verbose=FALSE)
})
# Run SCTransform
#
# This is a new normalisation method that replaces previous Seurat functions 'NormalizeData', 'FindVariableFeatures', and 'ScaleData'.
# vignette: https://satijalab.org/seurat/v3.0/sctransform_vignette.html
# paper: https://www.biorxiv.org/content/10.1101/576827v2
# Normalised data end up here: sc@assays$SCT@data
sc = purrr::map(list_names(sc), function(n) { SCTransform(sc[[n]], vars.to.regress=param$vars_to_regress, min_cells=param$feature_filter[[n]][["min_cells"]], verbose=FALSE) })
# Note: It is not guaranteed that all genes are successfully normalised with SCTransform. Consequently, some genes might be missing from the SCT assay. See: https://github.com/ChristophH/sctransform/issues/27Experience shows that 1,000-2,000 genes with the highest cell-to-cell variation are often sufficient to describe the global structure of a single cell dataset. For example, cell type-specific genes typically highly vary between cells. Housekeeping genes, on the other hand, are similarly expressed across cells and can be disregarded to differentiate between cells.
To determine variable genes, we need to separate biological variability from technical variability. Technical variability arises especially for lowly expressed genes, where high variability corresponds to small absolute changes that we are not interested in. Here, we use the variance-stabilizing transformation (vst) method implemented in Seurat (Hafemeister and Satija (2019). This method first models the technical variability as a relationship between mean gene expression and variance using local polynomial regression. The model is then used to calculate the expected variance based on the observed mean gene expression. The difference between the observed and expected variance is called residual variance and likely reflects biological variability. The top 3,000 variable genes are used for further analysis.
# Show variable genes
plist = purrr::map(list_names(sc), function(n) {
top10 = head(Seurat::VariableFeatures(sc[[n]]), 10)
p = Seurat::VariableFeaturePlot(sc[[n]], assay="RNA", selection.method="vst", col=c("grey", param$col))
p = PlotMystyle(p, title=n)
p = LabelPoints(plot=p, points=top10, repel=TRUE, xnudge=0, ynudge=0)
p = p + theme(legend.position=c(0.2, 0.8), legend.background=element_rect(fill=alpha("white", 0.0)))
return(p)
})
patchwork::wrap_plots(plist) + patchwork::plot_annotation("Variable genes")# Show variable genes
plist = purrr::map(list_names(sc), function(n) {
top10 = head(Seurat::VariableFeatures(sc[[n]], assay="SCT"), 10)
p = Seurat::VariableFeaturePlot(sc[[n]], assay="SCT", selection.method="sct", col=c("grey", param$col))
p = PlotMystyle(p, title=n)
p = LabelPoints(plot=p, points=top10, repel=TRUE, xnudge=0, ynudge=0)
p = p + theme(legend.position=c(0.2, 0.8),legend.background=element_rect(fill=alpha("white", 0.0)))
return(p)
})
patchwork::wrap_plots(plist) + patchwork::plot_annotation("Variable genes")n_cells_rle_plot = 100
# Sample at most 100 cells per dataset and save their identity; also get list of features shared among datasets
cells_RLE_subset = purrr::map(sc, function(s) {
cells = Seurat::Cells(s)
cell_subset = sample(cells, min(n_cells_rle_plot, length(cells)))
cell_subset = sort(setNames(as.character(s[[]][cell_subset, "orig.ident", drop=TRUE]),cell_subset))
return(cell_subset)
})
cells_orig_sample = purrr::flatten_chr(cells_RLE_subset)
cells_RLE_subset = purrr::map(cells_RLE_subset, names)
shared_rna_features = purrr::reduce(purrr::map(sc, function(s) rownames(s[["RNA"]])), intersect)
shared_sct_features = purrr::reduce(purrr::map(sc, function(s) rownames(s[["SCT"]])), intersect)To better understand the efficiency of the applied normalisation procedures, we plot the relative log expression of genes in at most 100 randomly selected cells per sample before and after normalisation. This type of plot reveals unwanted variation in your data. The concept is taken from Gandolfo and Speed (2018). In brief, we remove variation between genes, leaving only variation between samples. If expression levels of most genes are similar in all cell types, sample heterogeneity is a sign of unwanted variation.
For each gene, we calculate its median expression across all cells, and then calculate the deviation from this median for each cell. For each cell, we plot the median expression (black), the interquartile range (lightgrey), whiskers defined as 1.5 times the interquartile range (darkgrey), and outliers (#1F77B4FF, #FF7F0EFF).
# Get counts from assay RNA
# Note: The counts are retrieved as sparse matrices and then cbind is used to combine them. Only at the plotRLE,
cell_subset_raw = purrr::map(seq(length(sc)), function(i) {
d = GetAssayData(subset(sc[[i]], cells=cells_RLE_subset[[i]], features=shared_rna_features), assay="RNA", slot="counts")
colnames(d) = paste(colnames(d), i, sep=".")
return(d)
})
cell_subset_raw = purrr::reduce(cell_subset_raw, cbind)
# Plot
p = PlotRLE(as.matrix(log2(cell_subset_raw + 1)), id=cells_orig_sample, col=param$col_samples) + labs(title="log2(raw counts + 1)")
p# Get normalised data from assay RNA
# Note: The counts are retrieved as sparse matrices and then cbind is used to combine them
cell_subset_norm1 = purrr::map(seq(length(sc)), function(i) {
d = GetAssayData(subset(sc[[i]], cells=cells_RLE_subset[[i]], features=shared_rna_features), assay="RNA", slot="data")
colnames(d) = paste(colnames(d), i, sep=".")
return(d)
})
cell_subset_norm1 = purrr::reduce(cell_subset_norm1, cbind)
# Plot
p = PlotRLE(as.matrix(cell_subset_norm1), id=cells_orig_sample, col=param$col_samples) + labs(title="Normalised data")
p # Get normalised data from assay SCT
# Note: The counts are retrieved as sparse matrices and then cbind is used to combine them
cell_subset_norm2 = purrr::map(seq(length(sc)), function(i) {
d = GetAssayData(subset(sc[[i]], cells=cells_RLE_subset[[i]], features=shared_sct_features), assay="SCT", slot="data")
colnames(d) = paste(colnames(d), i, sep=".")
return(d)
})
cell_subset_norm2 = purrr::reduce(cell_subset_norm2, cbind)
# Plot
p = PlotRLE(as.matrix(cell_subset_norm2), id=cells_orig_sample, col=param$col_samples) + labs(title="SCTransform'ed data")
p if (param$integrate_samples[["method"]]!="single") {
# Markdown text for this section (do not change intendation)
cat("## Integration of multiple datasets\n\n")
# Feature metadata is removed by Seurat merge entirely; save separately for each assay and add again afterwards
assay_names = unique(purrr::flatten_chr(purrr::map(list_names(sc), function(n) { Seurat::Assays(sc[[n]]) } )))
# Loop through all assays and accumulate meta data
feature_data_for_assay = purrr::map(values_to_names(assay_names), function(a) {
# "feature_id", "feature_name", "feature_type" are accumulated for all assays and stored just once
# This step is skipped for assays that do not contain all three rypes of feature information
contains_neccessary_columns = purrr::map_lgl(list_names(sc), function(n) { all(c("feature_id","feature_name","feature_type") %in% colnames(sc[[n]][[a]][[]])) })
if (all(contains_neccessary_columns)) {
feature_id_name_type = purrr::reduce(list_names(sc), function(x, y) {
df_x = sc[[x]][[a]][[c("feature_id", "feature_name", "feature_type")]]
df_y = sc[[y]][[a]][[c("feature_id", "feature_name", "feature_type")]]
new_rows = which(!rownames(df_y) %in% rownames(df_x))
rbind(df_x, df_y[new_rows,])
})
feature_id_name_type$row_names = rownames(feature_id_name_type)
} else {
feature_id_name_type = NULL
}
# For all other metadata, we prefix column names with the dataset
other_feature_data = purrr::map(list_names(sc), function(n) {
df = sc[[n]][[a]][[]] %>% dplyr::select(-dplyr::one_of("feature_id","feature_name","feature_type"))
colnames(df) = paste(n, colnames(df), sep=".")
df$row_names = rownames(df)
return(df)
})
# Now join everything by row_names by full outer join
if(!is.null(feature_id_name_type)) {
feature_data = purrr::reduce(c(list(feature_id_name_type=feature_id_name_type),other_feature_data), dplyr::full_join, by="row_names")
} else {
feature_data = purrr::reduce(other_feature_data, dplyr::full_join, by="row_names")
}
rownames(feature_data) = feature_data$row_names
feature_data$row_names = NULL
return(feature_data)
})
}# Standard method for integrating multiple samples. Best performance but computationally intensive.
if (param$integrate_samples[["method"]]=="standard") {
# Note "Assay names should only have numbers and letters: Warnung: Keys should be one or more alphanumeric characters followed by an underscore, setting key from rna_integrated_ to rnaintegrated_" (seurat/R/object.R)
# The integration step will temporarily occupy lots of memory. However, R has problems with freeing unused memory.
# By wrapping the steps into a function, hopefully this works a bit better.
run_standard_integration = function(sc_objs, ndims=30, vars_to_regress=c()) {
# Find integration anchors for assay RNA
integrate_RNA_features = Seurat::SelectIntegrationFeatures(object.list=sc_objs, nfeatures=3000, assay=rep("SCT", length(sc)), verbose=FALSE)
integrate_RNA_anchors = Seurat::FindIntegrationAnchors(object.list=sc_objs, normalization.method="LogNormalize", anchor.features=integrate_RNA_features, dims=1:ndims, assay=rep("RNA", length(sc_objs)), verbose=FALSE)
tmp_sc_objs = Seurat::IntegrateData(integrate_RNA_anchors, new.assay.name="RNAintegrated", normalization.method="LogNormalize", dims=1:ndims, verbose=FALSE)
sc_obj_RNAintegrated_assay = Seurat::GetAssay(tmp_sc_objs, assay="RNAintegrated")
rm(tmp_sc_objs, integrate_RNA_anchors, integrate_RNA_features)
# Find integration anchors for assay SCT
integrate_SCT_features = SelectIntegrationFeatures(object.list=sc_objs, nfeatures=3000, assay=rep("SCT", length(sc)), verbose=FALSE)
sc_objs = PrepSCTIntegration(object.list=sc_objs, anchor.features=integrate_SCT_features, verbose=TRUE)
integrate_SCT_anchors = FindIntegrationAnchors(object.list=sc_objs, normalization.method="SCT", anchor.features=integrate_SCT_features, assay=rep("SCT", length(sc_objs)), verbose=FALSE)
sc_objs = Seurat::IntegrateData(integrate_SCT_anchors, new.assay.name="SCTintegrated", normalization.method="SCT", dims=1:ndims, verbose=FALSE)
sc_objs[["RNAintegrated"]] = sc_obj_RNAintegrated_assay
rm(integrate_SCT_anchors, integrate_SCT_features, sc_obj_RNAintegrated_assay)
# scale data: according to seurat, this is needed only for the integrated RNA assay
sc_objs = Seurat::ScaleData(sc_objs, features=rownames(sc_objs[["RNAintegrated"]]), verbose=FALSE, vars.to.regress=vars_to_regress, assay="RNAintegrated")
# Call garbage collector to free memory (hope it helps)
gc()
return(sc_objs)
}
# call function
sc = run_standard_integration(sc, ndims=param$integrate_samples[["dimensions"]], vars_to_regress=param$vars_to_regress)
}if (param$integrate_samples[["method"]]!="single") {
# Markdown text for this section (do not change intendation)
cat("\n\n### Relative log expression after integration {.tabset}\n\n")
cat("#### Standard normalisation (RNA)\n\n")
# Get normalised data from assay RNAintegrated and plot RLE
cell_subset_norm1 = GetAssayData(subset(sc, cells=purrr::flatten_chr(cells_RLE_subset), features=shared_rna_features), assay="RNAintegrated", slot="data")
PlotRLE(as.matrix(cell_subset_norm1), id=cells_orig_sample, col=param$col_samples) + labs(title="Normalised integrated data")
}if (param$integrate_samples[["method"]]!="single") {
# Markdown text for this section (do not change intendation)
cat("\n\n#### Advanced normalisation (SCT)\n\n")
# Get normalised data from assay SCTintegrated and plot RLE
cell_subset_norm2 = GetAssayData(subset(sc, cells=purrr::flatten_chr(cells_RLE_subset), features=shared_sct_features), assay="SCTintegrated", slot="data")
PlotRLE(as.matrix(cell_subset_norm2), id=cells_orig_sample, col=param$col_samples) + labs(title="SCTransform'ed integrated data")
}if (param$integrate_samples[["method"]]!="single") {
# This is neccessary for closing the tabset; leave intendation
#cat("\n\n</div>\n\n")
# Add feature metadata
# Note: sc is now not a list anymore so that we do not use purrr::map
for (a in Seurat::Assays(sc)) {
if (a %in% names(feature_data_for_assay)) {
sc[[a]] = Seurat::AddMetaData(sc[[a]], feature_data_for_assay[[a]][rownames(sc[[a]]),, drop=FALSE])
}
}
}
# Set default assay (will be the integrated version)
DefaultAssay(sc) = paste0(param$normalisation_default,"integrated")A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. At this point of the analysis, we have already reduced the dimensionality of the dataset to 3,000 variable genes. The biological manifold however can be described by far fewer dimensions than the number of (variable) genes. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarise a dataset, and to visualise a dataset.
We use Principal Component Analysis (PCA) to summarise a dataset, overcoming noise and reducing the data to its essential components. Each principal component (PC) represents a “metafeature” that combines information across a correlated gene set. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualise the dataset, placing similar cells together in 2D space, see below.
To decide how many PCs to include in downstream analyses, we visualize cells and genes that define the PCA.
# Run PCA for default normalisation
sc = Seurat::RunPCA(sc, features =Seurat::VariableFeatures(object=sc))
p = Seurat::VizDimLoadings(sc, dims=1:2, reduction="pca", col=param$col, combine=FALSE)
for (i in 1:length(p)) p[[i]] = PlotMystyle(p[[i]])
p = patchwork::wrap_plots(p, ncol = 2) + patchwork::plot_annotation("Top gene loadings of the first two PCs")
pp = Seurat::DimPlot(sc, reduction="pca", cols=param$col_samples)
p = PlotMystyle(p, title="Cells arranged by the first two PCs", legend_position="bottom")
pWe next need to decide how many PCs we want to use for our analyses. The following “Elbow plot” is designed to help us make an informed decision. PCs are ranked based on the percentage of variance they explain.
For your dataset, we decided to go for 10 PCs.
# More approximate technique used to reduce computation time
p = Seurat::ElbowPlot(sc, ndims=20)
p = PlotMystyle(p, title="Elbow plot")
pSeurat’s clustering method first constructs a graph structure, where nodes are cells and edges are drawn between cells with similar gene expression patterns. Technically speaking, Seurat first constructs a K-nearest neighbor (KNN) graph based on Euclidean distance in PCA space, and refines edge weights between cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To partition the graph into highly interconnected parts, cells are iteratively grouped together using the Leiden algorithm.
# Note: I changed the seed in ./lib/python3.6/site-packages/leidenalg/functions.py to 11 for reproducibility
# The number of clusters can be optimized by tuning 'resolution' -> based on feedback from the client whether or not clusters make sense
# Choose the number of PCs to use for clustering
sc = Seurat::FindNeighbors(sc, dims=1:param$pc_n)
# Cluster using the Leiden algorithm
# Paper to Leiden algorithm: https://www.nature.com/articles/s41598-019-41695-z
# Seurat vignette suggests resolution parameter between 0.4-1.2 for datasets of about 3k cells
sc = Seurat::FindClusters(sc, resolution=param$cluster_resolution, algorithm=4)
# Set up colors for clusters
cluster_names = levels(sc$seurat_clusters)
param$col_clusters = GenerateColours(num_colours=length(cluster_names), palette=param$col_palette_clusters)
names(param$col_clusters) = cluster_namesWe use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.
Take care not to mis-read a UMAP:
For a nice read to intuitively understand UMAP, see Unknown (2020b).
# Default UMAP
sc = Seurat::RunUMAP(sc, dims=1:param$pc_n)
# 3D UMAP
sc = Seurat::RunUMAP(sc, dims=1:param$pc_n, n.components=3, reduction.name="umap3d", reduction.key="UMAP3D_")
# Note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters
cluster_cells = table(sc@active.ident)
cluster_labels = paste0(levels(sc@active.ident)," (", cluster_cells[levels(sc@active.ident)],")")
p = Seurat::DimPlot(sc, reduction="umap", label=TRUE) + scale_colour_manual("Cluster", labels=cluster_labels)
p = PlotMystyle(p, "UMAP, cells coloured by cluster identity", legend_position="bottom", col=param$col_clusters)
p# Add a UMAP that is coloured by sample of origin
if (nrow(param$path_data) > 1) {
p = Seurat::DimPlot(sc, reduction="umap", label=TRUE, group.by="orig.ident")
p = PlotMystyle(p, "UMAP, cells coloured by sample of origin", legend_position="bottom", col=param$col_samples)
print(p)
}# Count cells per cluster per sample
tbl = lapply(unique(sc[[]]$orig.ident), function(i) {
tmp = sc[[]] %>% dplyr::filter(orig.ident==i) %>% dplyr::count(seurat_clusters) %>%
dplyr::mutate(perc=round(n/sum(n)*100,2))
colnames(tmp)[2:ncol(tmp)] = paste(colnames(tmp)[2:ncol(tmp)], i, sep="_")
return(tmp)
}) %>%
purrr::reduce(dplyr::full_join, by="seurat_clusters")
knitr::kable(tbl, align="l", caption="Number of cells per cluster per sample") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))How much do gene expression profiles in the dataset reflect the cell cycle phases the single cells were in? After initial normalisation, we determined the effects of cell cycle heterogeneity by calculating a score for each cell based on its expression of G2M and S phase markers. Scoring is based on the strategy described in Tirosh et al. (2016), and human gene symbols are translated to gene symbols of the species of interest using biomaRt. This section of the report visualises the above calculated cell cycle scores.
# Get a feeling for how many cells are affected
p1 = ggplot(sc[[]], aes(x=S.Score, y=G2M.Score, colour=Phase)) +
geom_point() +
scale_x_continuous("G1/S score") +
scale_y_continuous("G2/M score")
p1 = PlotMystyle(p1)
p2 = ggplot(sc@meta.data %>%
dplyr::group_by(seurat_clusters,Phase) %>%
dplyr::summarise(num_reads=length(Phase)),
aes(x=seurat_clusters, y=num_reads, fill=Phase)) +
geom_bar(stat="identity", position="fill") +
scale_x_discrete("Seurat clusters") +
scale_y_continuous("Fraction of cells")
p2 = PlotMystyle(p2)
p = p1 + p2 & theme(legend.position="bottom")
p = p + patchwork::plot_annotation(title="Cell cycle phases")
p# UMAP with phases superimposed
# Note: This is a hack to colour by phase but label by Cluster
p = Seurat::DimPlot(sc, group.by="Phase", pt.size=1)
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data),]
p = LabelClusters(p, id = "seurat_clusters")
PlotMystyle(p, title="UMAP, cells coloured by cell cycle phases", legend_title="Phase")p = Seurat::FeaturePlot(sc, features="S.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", label=TRUE, cols=c("lightgrey", param$col))
p = PlotMystyle(p, title="UMAP, cells coloured by S phase")
pp = Seurat::FeaturePlot(sc, features="G2M.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col), label=TRUE)
p = PlotMystyle(p, title="UMAP, cells coloured by G2M phase")
pp = Seurat::FeaturePlot(sc, features="CC.Difference", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col), label=TRUE)
p = PlotMystyle(p, title="UMAP, cells coloured by CC.Difference")
pDo cells in individual clusters have particularly high counts, detected genes or mitochondrial content?
p = Seurat::FeaturePlot(sc, features="nCount_RNA", cols=c("lightgrey", param$col), label=TRUE)
p = PlotMystyle(p, title="nCount_RNA")
pDo cells in individual clusters express provided known marker genes?
if(!is.null(param$file_known_markers)) {
# Read known marker genes
known_markers = openxlsx::read.xlsx(param$file_known_markers)
known_markers_list = lapply(colnames(known_markers), function(x) {
y = ensembl_to_seurat_rowname[known_markers[,x]] %>%
na.exclude() %>% unique() %>% sort()
return(y)
})
names(known_markers_list) = colnames(known_markers)
known_markers_vect = unlist(known_markers_list) %>% unique() %>% sort()
idx_dotplot = sapply(1:length(known_markers_list), function(x) length(known_markers_list[[x]]) <=50)
idx_avgplot = sapply(1:length(known_markers_list), function(x) length(known_markers_list[[x]]) >= 10)
fig_height_knownMarkers_dotplot = max(5, 5 * sum(idx_dotplot))
fig_height_knownMarkers_avgplot = max(5, 5 * sum(idx_avgplot))
fig_height_knownMarkers_vect = max(5, length(known_markers_vect))
known_markers_n = length(known_markers_list)
} else {
known_markers_n = 0
}You provided 7 list(s) of known marker genes. In the following tabs, you find:
A dot plot visualizes how gene expression changes across different clusters. The size of a dot encodes the percentage of cells in a cluster that express the gene, while the color encodes the average expression across all cells within the cluster.
if (any(idx_dotplot)) {
known_markers_dotplot = known_markers_list[idx_dotplot]
p = list()
for (i in 1:length(known_markers_dotplot)) {
g = known_markers_dotplot[[i]]
g = g[length(g):1]
p[[i]] = Seurat::DotPlot(sc, features=g, cols=c("lightgrey", param$col)) + ylab("Cluster")
p[[i]] = PlotMystyle(p[[i]], title=paste("Known marker genes:", names(known_markers_dotplot)[i])) +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5))
}
p = patchwork::wrap_plots(p, ncol=1)
p
} else {
print("This tab is used for dot plots for up to 50 genes. All provided lists are longer than this, and hence dot plots are skipped.")
}An average feature plot visualizes the average gene expression of each gene list on a single-cell level, subtracted by the aggregated expression of control feature sets. The color of the plot encodes the calculated scores, whereat positive scores suggest that genes are expressed more highly than expected.
if (any(idx_avgplot)) {
known_markers_avgplot = known_markers_list[idx_avgplot]
sc = Seurat::AddModuleScore(sc, features=known_markers_avgplot, assay="SCT", ctrl=10, name="known_markers")
idx_replace_names = grep("^known_markers[0-9]+$", colnames(sc@meta.data), perl=TRUE)
colnames(sc@meta.data)[idx_replace_names] = names(known_markers_avgplot)
p = Seurat::FeaturePlot(sc, features=names(known_markers_avgplot), cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
for (i in 1:length(known_markers_avgplot)) p[[i]] = PlotMystyle(p[[i]],
title=paste("Known marker genes:",
names(known_markers_avgplot)[i]))
p = patchwork::wrap_plots(p, ncol=1)
p
} else {
print("This tab is used to plot an average for 10 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.")
}## [1] "This tab is used to plot an average for 10 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped."
An individual feature plot colours single cells on the UMAP according to their normalised gene expression.
if (length(known_markers_vect) <= 30) {
p = Seurat::FeaturePlot(sc, features=known_markers_vect, cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
for (i in 1:length(p)) p[[i]] = PlotMystyle(p[[i]])
patchwork::wrap_plots(p, ncol=2)
} else {
print("This tab is used to plot up to 30 known marker genes. Your provided list is longer than this, and hence individual feature plots are skipped.")
}We next identify genes that are differentially expressed in one cluster compared to all other clusters, based on raw “RNA” data and the method “MAST”. Resulting p-values are adjusted using the Bonferroni method. The names of differentially expressed genes per cluster, alongside statistical measures and additional gene annotation are written to file.
# We load and unload the MAST R package in this chunk, as it overwrites Seurat functions
library(MAST)
# Find markers for every cluster compared to all remaining cells, report positive and negative ones
# min.pct = requires feature to be detected at this minimum percentage in either of the two groups of cells
# logfc.threshold = requires a feature to be differentially expressed on average by some amount between the two groups
# only.pos = find only positive markers
# Should cell cycle effects be ignored?
latent_vars=NULL
if(param$cc_remove) {
if(param$cc_remove_all) {
latent_vars = c("S.Score", "G2M.Score")
} else {
latent_vars = "CC.Difference"
}
}
# Review recommends using "MAST"; Mathias uses "LR"
# ALWAYS USE: assay="RNA" or assay="SCT"
# DONT USE: assay=integrated datasets; this data is normalised and contains only 2k genes
sc_markers = Seurat::FindAllMarkers(sc, assay="RNA", test.use="MAST",
only.pos=FALSE, min.pct=0.25, logfc.threshold=0.25,
latent.vars=param$latent_vars)
sc_markers_top2 = sc_markers %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n=2, wt=avg_logFC) %>%
as.data.frame()
# Show top 2 merkers per cluster
knitr::kable(sc_markers_top2, align="l", caption="Top 2 DEGs per cell cluster") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="700px")| p_val | avg_logFC | pct.1 | pct.2 | p_val_adj | cluster | gene |
|---|---|---|---|---|---|---|
| 0 | 0.9913495 | 0.783 | 0.419 | 0 | 1 | DUSP2 |
| 0 | 1.2146775 | 0.394 | 0.089 | 0 | 1 | GZMK |
| 0 | 1.3311606 | 0.963 | 0.346 | 0 | 2 | LTB |
| 0 | 1.2017087 | 0.961 | 0.304 | 0 | 2 | IL7R |
| 0 | 1.6791687 | 0.992 | 0.282 | 0 | 3 | GNLY |
| 0 | 0.8093787 | 0.996 | 0.531 | 0 | 3 | CCL5.1 |
| 0 | 2.7707200 | 1.000 | 0.018 | 0 | 4 | CD79A |
| 0 | 3.5828496 | 0.608 | 0.015 | 0 | 4 | IGKC |
| 0 | 4.1959026 | 0.998 | 0.062 | 0 | 5 | S100A9 |
| 0 | 4.1330835 | 1.000 | 0.102 | 0 | 5 | LYZ |
| 0 | 2.0929338 | 0.989 | 0.308 | 0 | 6 | GNLY |
| 0 | 1.6593385 | 0.976 | 0.357 | 0 | 6 | PRF1 |
| 0 | 1.3116630 | 0.997 | 0.351 | 0 | 7 | GZMH |
| 0 | 1.4312706 | 0.976 | 0.316 | 0 | 7 | FGFBP2 |
| 0 | 1.1642258 | 0.720 | 0.077 | 0 | 8 | LEF1 |
| 0 | 1.1937578 | 0.747 | 0.093 | 0 | 8 | CCR7 |
| 0 | 1.9924267 | 0.949 | 0.192 | 0 | 9 | KLRB1 |
| 0 | 1.4526692 | 0.602 | 0.104 | 0 | 9 | AQP3 |
| 0 | 2.7295526 | 1.000 | 0.198 | 0 | 10 | LST1 |
| 0 | 2.4820210 | 1.000 | 0.166 | 0 | 10 | AIF1 |
| 0 | 5.1206311 | 0.926 | 0.017 | 0 | 11 | PPBP |
| 0 | 4.8122067 | 0.882 | 0.076 | 0 | 11 | NRGN |
| 0 | 2.5224098 | 1.000 | 0.550 | 0 | 12 | HLA-DPA1.2 |
| 0 | 2.5958110 | 1.000 | 0.186 | 0 | 12 | CST3 |
# Add Ensembl annotation
sc_markers_ensembl = seurat_rowname_to_ensembl[sc_markers[,"gene"]]
sc_markers_annot = cbind(sc_markers, annot_ensembl[sc_markers_ensembl,])
# Output in Excel sheet
sc_markers_lst = lapply(levels(sc_markers_annot$cluster), function(x) {sc_markers_annot %>% dplyr::filter(cluster==x)})
names(sc_markers_lst) = paste0("cluster", levels(sc_markers$cluster))
openxlsx::write.xlsx(sc_markers_lst, file=paste0(param$path_out, "/markers.xlsx"))
# Filter markers based on p-value and fold-change
sc_markers_filt = sc_markers %>%
dplyr::filter(p_val_adj <= param$padj) %>%
dplyr::filter((avg_logFC <= -param$log2fc) | (avg_logFC >= param$log2fc)) %>%
as.data.frame()
sc_markers_filt_down = sc_markers_filt %>%
dplyr::filter(avg_logFC <= -param$log2fc) %>%
as.data.frame()
sc_markers_filt_up = sc_markers_filt %>%
dplyr::filter(avg_logFC >= param$log2fc) %>%
as.data.frame()
# Number of DEGs per cluster
cluster_all = sort(unique(sc_markers[,"cluster"]))
sc_markers_filt_n = cbind(Cluster=cluster_all,
Down=sapply(cluster_all, function(x) sum(sc_markers_filt_down$cluster == x)),
Up=sapply(cluster_all, function(x) sum(sc_markers_filt_up$cluster == x))) %>%
as.data.frame() %>%
tidyr::pivot_longer(cols=c("Down", "Up"),
names_to="Direction",
values_to="n")
sc_markers_filt_n$Cluster = as.factor(sc_markers_filt_n$Cluster)
p = ggplot(sc_markers_filt_n, aes(x=Cluster, y=n, fill=Direction)) + geom_bar(stat="identity")
p = PlotMystyle(p,
title=paste0("Number of DEGs per cell cluster\n(FC=", 2^param$log2fc, ", adj. p-value=", param$padj, ")"),
fill=c("steelblue", "darkgoldenrod1"))
pThe following plots are exemplary to how we can visualize differentially expressed genes using the Seurat R-package. The selected genes are the top differentially expressed genes for each cluster, respectively.
# Get top gene per cluster and plot
genes_example = sc_markers %>%
dplyr::group_by(cluster) %>%
dplyr::top_n(n=1, wt=avg_logFC) %>%
dplyr::pull(gene) %>%
unique()
fig_height_deg = max(5, length(genes_example))# Shows gene expression on the UMAP
p = Seurat::FeaturePlot(sc, features=genes_example, cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
names(p) = genes_example
for (i in names(p)) p[[i]] = PlotMystyle(p[[i]], title=i)
p = patchwork::wrap_plots(p, ncol=2) +
patchwork::plot_annotation(title="UMAP, cells coloured by normalised gene expression data")
p# Ridge plot of raw gene expression counts
p = Seurat::RidgePlot(sc, features=genes_example, slot="counts", combine=FALSE)
names(p) = genes_example
for (i in names(p)) p[[i]] = PlotMystyle(p[[i]], title=i, legend_title="Cell identity", fill=param$col_clusters)
p = patchwork::wrap_plots(p, ncol=2) +
patchwork::plot_annotation(title="Ridge plot of raw gene expression counts") +
patchwork::plot_layout(guides = "collect") &
theme(legend.position="bottom")
p# Ridge plot of normalised gene expression data
p = Seurat::RidgePlot(sc, features=genes_example, combine=FALSE)
names(p) = genes_example
for (i in names(p)) p[[i]] = PlotMystyle(p[[i]], title=i, legend_title="Cell identity", fill=param$col_clusters)
p = patchwork::wrap_plots(p, ncol=2) +
patchwork::plot_annotation(title="Ridge plot of normalised gene expression data") +
patchwork::plot_layout(guides = "collect") &
theme(legend.position="bottom")
p# Visualises how feature expression changes across different clusters
p = Seurat::DotPlot(sc, features=genes_example[length(genes_example):1], cols=c("lightgrey", param$col))
p = PlotMystyle(p, title="Dot plot of normalised gene expression data") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5))
pTo gain first insights into potential functions of cells in a cluster, we test for over-representation of functional terms amongst up- and down-regulated genes of each cluster. Over-represented terms are written to file.
We first translate gene symbols of up- and down-regulated genes per cluster into Entrez gene symbols, and then use the “enrichR” R-package to access the “Enrichr” website (Unknown 2020c). You can choose to test functional enrichment from a wide range of databases:
dbs_all = enrichR::listEnrichrDbs()
knitr::kable(dbs_all, align="l", caption="Enrichr databases") %>%
kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="300px")| geneCoverage | genesPerTerm | libraryName | link | numTerms |
|---|---|---|---|---|
| 13362 | 275 | Genome_Browser_PWMs | http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/ | 615 |
| 27884 | 1284 | TRANSFAC_and_JASPAR_PWMs | http://jaspar.genereg.net/html/DOWNLOAD/ | 326 |
| 6002 | 77 | Transcription_Factor_PPIs | 290 | |
| 47172 | 1370 | ChEA_2013 | http://amp.pharm.mssm.edu/lib/cheadownload.jsp | 353 |
| 47107 | 509 | Drug_Perturbations_from_GEO_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 701 |
| 21493 | 3713 | ENCODE_TF_ChIP-seq_2014 | http://genome.ucsc.edu/ENCODE/downloads.html | 498 |
| 1295 | 18 | BioCarta_2013 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 249 |
| 3185 | 73 | Reactome_2013 | http://www.reactome.org/download/index.html | 78 |
| 2854 | 34 | WikiPathways_2013 | http://www.wikipathways.org/index.php/Download_Pathways | 199 |
| 15057 | 300 | Disease_Signatures_from_GEO_up_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 |
| 4128 | 48 | KEGG_2013 | http://www.kegg.jp/kegg/download/ | 200 |
| 34061 | 641 | TF-LOF_Expression_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 269 |
| 7504 | 155 | TargetScan_microRNA | http://www.targetscan.org/cgi-bin/targetscan/data_download.cgi?db=vert_61 | 222 |
| 16399 | 247 | PPI_Hub_Proteins | http://amp.pharm.mssm.edu/X2K | 385 |
| 12753 | 57 | GO_Molecular_Function_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 1136 |
| 23726 | 127 | GeneSigDB | http://genesigdb.org/genesigdb/downloadall.jsp | 2139 |
| 32740 | 85 | Chromosome_Location | http://software.broadinstitute.org/gsea/msigdb/index.jsp | 386 |
| 13373 | 258 | Human_Gene_Atlas | http://biogps.org/downloads/ | 84 |
| 19270 | 388 | Mouse_Gene_Atlas | http://biogps.org/downloads/ | 96 |
| 13236 | 82 | GO_Cellular_Component_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 641 |
| 14264 | 58 | GO_Biological_Process_2015 | http://www.geneontology.org/GO.downloads.annotations.shtml | 5192 |
| 3096 | 31 | Human_Phenotype_Ontology | http://www.human-phenotype-ontology.org/ | 1779 |
| 22288 | 4368 | Epigenomics_Roadmap_HM_ChIP-seq | http://www.roadmapepigenomics.org/ | 383 |
| 4533 | 37 | KEA_2013 | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 474 |
| 10231 | 158 | NURSA_Human_Endogenous_Complexome | https://www.nursa.org/nursa/index.jsf | 1796 |
| 2741 | 5 | CORUM | http://mips.helmholtz-muenchen.de/genre/proj/corum/ | 1658 |
| 5655 | 342 | SILAC_Phosphoproteomics | http://amp.pharm.mssm.edu/lib/keacommandline.jsp | 84 |
| 10406 | 715 | MGI_Mammalian_Phenotype_Level_3 | http://www.informatics.jax.org/ | 71 |
| 10493 | 200 | MGI_Mammalian_Phenotype_Level_4 | http://www.informatics.jax.org/ | 476 |
| 11251 | 100 | Old_CMAP_up | http://www.broadinstitute.org/cmap/ | 6100 |
| 8695 | 100 | Old_CMAP_down | http://www.broadinstitute.org/cmap/ | 6100 |
| 1759 | 25 | OMIM_Disease | http://www.omim.org/downloads | 90 |
| 2178 | 89 | OMIM_Expanded | http://www.omim.org/downloads | 187 |
| 851 | 15 | VirusMINT | http://mint.bio.uniroma2.it/download.html | 85 |
| 10061 | 106 | MSigDB_Computational | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 858 |
| 11250 | 166 | MSigDB_Oncogenic_Signatures | http://www.broadinstitute.org/gsea/msigdb/collections.jsp | 189 |
| 15406 | 300 | Disease_Signatures_from_GEO_down_2014 | http://www.ncbi.nlm.nih.gov/geo/ | 142 |
| 17711 | 300 | Virus_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 323 |
| 17576 | 300 | Virus_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 323 |
| 15797 | 176 | Cancer_Cell_Line_Encyclopedia | https://portals.broadinstitute.org/ccle/home | 967 |
| 12232 | 343 | NCI-60_Cancer_Cell_Lines | http://biogps.org/downloads/ | 93 |
| 13572 | 301 | Tissue_Protein_Expression_from_ProteomicsDB | https://www.proteomicsdb.org/ | 207 |
| 6454 | 301 | Tissue_Protein_Expression_from_Human_Proteome_Map | http://www.humanproteomemap.org/index.php | 30 |
| 3723 | 47 | HMDB_Metabolites | http://www.hmdb.ca/downloads | 3906 |
| 7588 | 35 | Pfam_InterPro_Domains | ftp://ftp.ebi.ac.uk/pub/databases/interpro/ | 311 |
| 7682 | 78 | GO_Biological_Process_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 941 |
| 7324 | 172 | GO_Cellular_Component_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 205 |
| 8469 | 122 | GO_Molecular_Function_2013 | http://www.geneontology.org/GO.downloads.annotations.shtml | 402 |
| 13121 | 305 | Allen_Brain_Atlas_up | http://www.brain-map.org/ | 2192 |
| 26382 | 1811 | ENCODE_TF_ChIP-seq_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 816 |
| 29065 | 2123 | ENCODE_Histone_Modifications_2015 | http://genome.ucsc.edu/ENCODE/downloads.html | 412 |
| 280 | 9 | Phosphatase_Substrates_from_DEPOD | http://www.koehn.embl.de/depod/ | 59 |
| 13877 | 304 | Allen_Brain_Atlas_down | http://www.brain-map.org/ | 2192 |
| 15852 | 912 | ENCODE_Histone_Modifications_2013 | http://genome.ucsc.edu/ENCODE/downloads.html | 109 |
| 4320 | 129 | Achilles_fitness_increase | http://www.broadinstitute.org/achilles | 216 |
| 4271 | 128 | Achilles_fitness_decrease | http://www.broadinstitute.org/achilles | 216 |
| 10496 | 201 | MGI_Mammalian_Phenotype_2013 | http://www.informatics.jax.org/ | 476 |
| 1678 | 21 | BioCarta_2015 | https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 239 |
| 756 | 12 | HumanCyc_2015 | http://humancyc.org/ | 125 |
| 3800 | 48 | KEGG_2015 | http://www.kegg.jp/kegg/download/ | 179 |
| 2541 | 39 | NCI-Nature_2015 | http://pid.nci.nih.gov/ | 209 |
| 1918 | 39 | Panther_2015 | http://www.pantherdb.org/ | 104 |
| 5863 | 51 | WikiPathways_2015 | http://www.wikipathways.org/index.php/Download_Pathways | 404 |
| 6768 | 47 | Reactome_2015 | http://www.reactome.org/download/index.html | 1389 |
| 25651 | 807 | ESCAPE | http://www.maayanlab.net/ESCAPE/ | 315 |
| 19129 | 1594 | HomoloGene | http://www.ncbi.nlm.nih.gov/homologene | 12 |
| 23939 | 293 | Disease_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 839 |
| 23561 | 307 | Disease_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 839 |
| 23877 | 302 | Drug_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 906 |
| 15886 | 9 | Genes_Associated_with_NIH_Grants | https://grants.nih.gov/grants/oer.htm | 32876 |
| 24350 | 299 | Drug_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 906 |
| 3102 | 25 | KEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 428 |
| 31132 | 298 | Gene_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 2460 |
| 30832 | 302 | Gene_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 2460 |
| 48230 | 1429 | ChEA_2015 | http://amp.pharm.mssm.edu/Enrichr | 395 |
| 5613 | 36 | dbGaP | http://www.ncbi.nlm.nih.gov/gap | 345 |
| 9559 | 73 | LINCS_L1000_Chem_Pert_up | https://clue.io/ | 33132 |
| 9448 | 63 | LINCS_L1000_Chem_Pert_down | https://clue.io/ | 33132 |
| 16725 | 1443 | GTEx_Tissue_Sample_Gene_Expression_Profiles_down | http://www.gtexportal.org/ | 2918 |
| 19249 | 1443 | GTEx_Tissue_Sample_Gene_Expression_Profiles_up | http://www.gtexportal.org/ | 2918 |
| 15090 | 282 | Ligand_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 261 |
| 16129 | 292 | Aging_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 286 |
| 15309 | 308 | Aging_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 286 |
| 15103 | 318 | Ligand_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 261 |
| 15022 | 290 | MCF7_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 401 |
| 15676 | 310 | MCF7_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 401 |
| 15854 | 279 | Microbe_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 312 |
| 15015 | 321 | Microbe_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 312 |
| 3788 | 159 | LINCS_L1000_Ligand_Perturbations_down | https://clue.io/ | 96 |
| 3357 | 153 | LINCS_L1000_Ligand_Perturbations_up | https://clue.io/ | 96 |
| 12668 | 300 | L1000_Kinase_and_GPCR_Perturbations_down | https://clue.io/ | 3644 |
| 12638 | 300 | L1000_Kinase_and_GPCR_Perturbations_up | https://clue.io/ | 3644 |
| 8973 | 64 | Reactome_2016 | http://www.reactome.org/download/index.html | 1530 |
| 7010 | 87 | KEGG_2016 | http://www.kegg.jp/kegg/download/ | 293 |
| 5966 | 51 | WikiPathways_2016 | http://www.wikipathways.org/index.php/Download_Pathways | 437 |
| 15562 | 887 | ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X | 104 | |
| 17850 | 300 | Kinase_Perturbations_from_GEO_down | http://www.ncbi.nlm.nih.gov/geo/ | 285 |
| 17660 | 300 | Kinase_Perturbations_from_GEO_up | http://www.ncbi.nlm.nih.gov/geo/ | 285 |
| 1348 | 19 | BioCarta_2016 | http://cgap.nci.nih.gov/Pathways/BioCarta_Pathways | 237 |
| 934 | 13 | HumanCyc_2016 | http://humancyc.org/ | 152 |
| 2541 | 39 | NCI-Nature_2016 | http://pid.nci.nih.gov/ | 209 |
| 2041 | 42 | Panther_2016 | http://www.pantherdb.org/pathway/ | 112 |
| 5209 | 300 | DrugMatrix | https://ntp.niehs.nih.gov/drugmatrix/ | 7876 |
| 49238 | 1550 | ChEA_2016 | http://amp.pharm.mssm.edu/Enrichr | 645 |
| 2243 | 19 | huMAP | http://proteincomplexes.org/ | 995 |
| 19586 | 545 | Jensen_TISSUES | http://tissues.jensenlab.org/ | 1842 |
| 22440 | 505 | RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO | http://www.ncbi.nlm.nih.gov/geo/ | 1302 |
| 8184 | 24 | MGI_Mammalian_Phenotype_2017 | http://www.informatics.jax.org/ | 5231 |
| 18329 | 161 | Jensen_COMPARTMENTS | http://compartments.jensenlab.org/ | 2283 |
| 15755 | 28 | Jensen_DISEASES | http://diseases.jensenlab.org/ | 1811 |
| 10271 | 22 | BioPlex_2017 | http://bioplex.hms.harvard.edu/ | 3915 |
| 10427 | 38 | GO_Cellular_Component_2017 | http://www.geneontology.org/ | 636 |
| 10601 | 25 | GO_Molecular_Function_2017 | http://www.geneontology.org/ | 972 |
| 13822 | 21 | GO_Biological_Process_2017 | http://www.geneontology.org/ | 3166 |
| 8002 | 143 | GO_Cellular_Component_2017b | http://www.geneontology.org/ | 816 |
| 10089 | 45 | GO_Molecular_Function_2017b | http://www.geneontology.org/ | 3271 |
| 13247 | 49 | GO_Biological_Process_2017b | http://www.geneontology.org/ | 10125 |
| 21809 | 2316 | ARCHS4_Tissues | http://amp.pharm.mssm.edu/archs4 | 108 |
| 23601 | 2395 | ARCHS4_Cell-lines | http://amp.pharm.mssm.edu/archs4 | 125 |
| 20883 | 299 | ARCHS4_IDG_Coexp | http://amp.pharm.mssm.edu/archs4 | 352 |
| 19612 | 299 | ARCHS4_Kinases_Coexp | http://amp.pharm.mssm.edu/archs4 | 498 |
| 25983 | 299 | ARCHS4_TFs_Coexp | http://amp.pharm.mssm.edu/archs4 | 1724 |
| 19500 | 137 | SysMyo_Muscle_Gene_Sets | http://sys-myo.rhcloud.com/ | 1135 |
| 14893 | 128 | miRTarBase_2017 | http://mirtarbase.mbc.nctu.edu.tw/ | 3240 |
| 17598 | 1208 | TargetScan_microRNA_2017 | http://www.targetscan.org/ | 683 |
| 5902 | 109 | Enrichr_Libraries_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 121 |
| 12486 | 299 | Enrichr_Submissions_TF-Gene_Coocurrence | http://amp.pharm.mssm.edu/Enrichr | 1722 |
| 1073 | 100 | Data_Acquisition_Method_Most_Popular_Genes | http://amp.pharm.mssm.edu/Enrichr | 12 |
| 19513 | 117 | DSigDB | http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/ | 4026 |
| 14433 | 36 | GO_Biological_Process_2018 | http://www.geneontology.org/ | 5103 |
| 8655 | 61 | GO_Cellular_Component_2018 | http://www.geneontology.org/ | 446 |
| 11459 | 39 | GO_Molecular_Function_2018 | http://www.geneontology.org/ | 1151 |
| 19741 | 270 | TF_Perturbations_Followed_by_Expression | http://www.ncbi.nlm.nih.gov/geo/ | 1958 |
| 27360 | 802 | Chromosome_Location_hg19 | http://hgdownload.cse.ucsc.edu/downloads.html | 36 |
| 13072 | 26 | NIH_Funded_PIs_2017_Human_GeneRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 5687 |
| 13464 | 45 | NIH_Funded_PIs_2017_Human_AutoRIF | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 |
| 13787 | 200 | Rare_Diseases_AutoRIF_ARCHS4_Predictions | https://amp.pharm.mssm.edu/geneshot/ | 3725 |
| 13929 | 200 | Rare_Diseases_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 |
| 16964 | 200 | NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 12558 |
| 17258 | 200 | NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions | https://www.ncbi.nlm.nih.gov/pubmed/ | 5684 |
| 10352 | 58 | Rare_Diseases_GeneRIF_Gene_Lists | https://www.ncbi.nlm.nih.gov/gene/about-generif | 2244 |
| 10471 | 76 | Rare_Diseases_AutoRIF_Gene_Lists | https://amp.pharm.mssm.edu/geneshot/ | 3725 |
| 12419 | 491 | SubCell_BarCode | http://www.subcellbarcode.org/ | 104 |
| 19378 | 37 | GWAS_Catalog_2019 | https://www.ebi.ac.uk/gwas | 1737 |
| 6201 | 45 | WikiPathways_2019_Human | https://www.wikipathways.org/ | 472 |
| 4558 | 54 | WikiPathways_2019_Mouse | https://www.wikipathways.org/ | 176 |
| 3264 | 22 | TRRUST_Transcription_Factors_2019 | https://www.grnpedia.org/trrust/ | 571 |
| 7802 | 92 | KEGG_2019_Human | https://www.kegg.jp/ | 308 |
| 8551 | 98 | KEGG_2019_Mouse | https://www.kegg.jp/ | 303 |
| 12444 | 23 | InterPro_Domains_2019 | https://www.ebi.ac.uk/interpro/ | 1071 |
| 9000 | 20 | Pfam_Domains_2019 | https://pfam.xfam.org/ | 608 |
| 7744 | 363 | DepMap_WG_CRISPR_Screens_Broad_CellLines_2019 | https://depmap.org/ | 558 |
| 6204 | 387 | DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019 | https://depmap.org/ | 325 |
| 13420 | 32 | MGI_Mammalian_Phenotype_Level_4_2019 | http://www.informatics.jax.org/ | 5261 |
| 14148 | 122 | UK_Biobank_GWAS_v1 | https://www.ukbiobank.ac.uk/tag/gwas/ | 857 |
| 9813 | 49 | BioPlanet_2019 | https://tripod.nih.gov/bioplanet/ | 1510 |
| 1397 | 13 | ClinVar_2019 | https://www.ncbi.nlm.nih.gov/clinvar/ | 182 |
| 9116 | 22 | PheWeb_2019 | http://pheweb.sph.umich.edu/ | 1161 |
| 17464 | 63 | DisGeNET | https://www.disgenet.org | 9828 |
| 394 | 73 | HMS_LINCS_KinomeScan | http://lincs.hms.harvard.edu/kinomescan/ | 148 |
| 11851 | 586 | CCLE_Proteomics_2020 | https://portals.broadinstitute.org/ccle | 378 |
| 8189 | 421 | ProteomicsDB_2020 | https://www.proteomicsdb.org/ | 913 |
| 18704 | 100 | lncHUB_lncRNA_Co-Expression | https://amp.pharm.mssm.edu/lnchub/ | 3729 |
| 5605 | 39 | Virus-Host_PPI_P-HIPSTer_2020 | http://phipster.org/ | 6715 |
| 5718 | 31 | Elsevier_Pathway_Collection | http://www.transgene.ru/disease-pathways/ | 1721 |
| 14156 | 40 | Table_Mining_of_CRISPR_Studies | 802 |
# DEGs up and down per cluster
cluster_all = sort(unique(sc_markers[,"cluster"]))
genesets_up = lapply(cluster_all, function(x) {
tmp = sc_markers_filt_up %>%
dplyr::filter(cluster==x) %>%
dplyr::pull(gene)
# Pick the first matching Entrez symbol
tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1]) %>%
na.exclude() %>% unique()
return(tmp)
})
genesets_down = lapply(cluster_all, function(x) {
tmp = sc_markers_filt_down %>%
dplyr::filter(cluster==x) %>%
dplyr::pull(gene)
# Pick the first matching Entrez symbol
tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1]) %>%
na.exclude() %>% unique()
return(tmp)
})
names(genesets_up) = paste0("DEG_up_cluster_", cluster_all)
names(genesets_down) = paste0("DEG_down_cluster_", cluster_all)
genesets = c(genesets_up, genesets_down)
# Loop through gene lists
enriched = list()
for (i in 1:length(genesets)) {
if (length(genesets[[i]]) >= 3) {
message("Geneset ", names(genesets)[i])
enriched[[i]] = enrichR::enrichr(genesets[[i]], databases=param$enrichr_dbs)
} else {
message("Geneset ", names(genesets)[i], " has less than 3 genes; skip enrichr")
enriched[[i]] = NA
}
}
names(enriched) = names(genesets)
# Write enrichment results to file
enriched_top = matrix(NA, nrow=0, ncol=6)
colnames(enriched_top) = c("GeneSet", "Database", "Term", "Overlap", "Adjusted_pval", "Genes")
for (i in 1:length(enriched)) {
if (!is.null(enriched[[i]])) {
openxlsx::write.xlsx(enriched[[i]], file=paste0(param$path_out, "/Functions_", names(enriched)[i], ".xlsx"))
}
}The following table contains the top enriched term per geneset and database.
for (i in 1:length(enriched)) {
if ((!is.null(enriched[[i]])) & (!is.na(enriched[[i]]))) {
# Remember top term per geneset
for(j in 1:length(enriched[[i]])) {
enriched_top = rbind(enriched_top,
c(names(enriched)[i],
names(enriched[[i]])[j],
enriched[[i]][[j]][1, c("Term", "Overlap", "Adjusted.P.value", "Genes")]))
}
}
}
# Print table of top terms per gene set
knitr::kable(enriched_top, align="l", caption="Top enriched term per geneset") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) %>%
kableExtra::scroll_box(width="100%", height="700px")| GeneSet | Database | Term | Overlap | Adjusted_pval | Genes |
|---|---|---|---|---|---|
| DEG_up_cluster_1 | GO_Molecular_Function_2018 | MHC class I protein binding (GO:0042288) | 2/14 | 0.0052306512098949 | CD8B;CD8A |
| DEG_up_cluster_1 | GO_Biological_Process_2018 | T cell activation (GO:0042110) | 2/88 | 0.968354596870744 | CD8B;CD8A |
| DEG_up_cluster_1 | GO_Cellular_Component_2018 | T cell receptor complex (GO:0042101) | 2/16 | 0.0026722041846523 | CD8B;CD8A |
| DEG_up_cluster_2 | GO_Molecular_Function_2018 | intermediate filament binding (GO:0019215) | 1/11 | 1 | VIM |
| DEG_up_cluster_2 | GO_Biological_Process_2018 | cytokine-mediated signaling pathway (GO:0019221) | 6/633 | 0.0121089276129934 | GATA3;LTB;VIM;PLP2;IL7R;JUNB |
| DEG_up_cluster_2 | GO_Cellular_Component_2018 | membrane raft (GO:0045121) | 2/119 | 1 | LDHB;MAL |
| DEG_up_cluster_3 | GO_Molecular_Function_2018 | phospholipase activator activity (GO:0016004) | 1/6 | 1 | CCL5 |
| DEG_up_cluster_3 | GO_Biological_Process_2018 | lymphocyte mediated immunity (GO:0002449) | 2/11 | 0.0923412918635355 | CD8A;KLRD1 |
| DEG_up_cluster_3 | GO_Cellular_Component_2018 | platelet dense granule lumen (GO:0031089) | 1/14 | 1 | CTSW |
| DEG_up_cluster_4 | GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | 6/16 | 5.05975921666157e-08 | CD74;HLA-DMA;HLA-DMB;HLA-DRA;MS4A1;HLA-DRB1 |
| DEG_up_cluster_4 | GO_Biological_Process_2018 | antigen receptor-mediated signaling pathway (GO:0050851) | 17/257 | 4.72487752063177e-12 | BLK;IGHM;MEF2C;CD79B;CD79A;IGHG1;IGKC;CD19;IGLC3;IGHD;HLA-DPB1;HLA-DRA;IGLC2;HLA-DQA1;HLA-DRB1;HLA-DPA1;HLA-DQB1 |
| DEG_up_cluster_4 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 9/14 | 3.19242087086651e-16 | CD74;HLA-DMA;HLA-DMB;HLA-DPB1;HLA-DRA;HLA-DQA1;HLA-DRB1;HLA-DPA1;HLA-DQB1 |
| DEG_up_cluster_5 | GO_Molecular_Function_2018 | protein homodimerization activity (GO:0042803) | 31/664 | 0.00203330978499852 | CEBPA;CEBPB;BCL2A1;HEXB;LRRK2;PLEK;PYGL;TYMP;PYCARD;GLIPR2;PSAP;HMOX1;LRRFIP1;SRGAP2;S100A11;S100A10;MCL1;BNIP3L;FCER1G;WARS;GCA;STAT1;SLC11A1;ADA2;IRAK3;LILRB2;CCDC88A;CD4;JAML;S100A6;PECAM1 |
| DEG_up_cluster_5 | GO_Biological_Process_2018 | neutrophil activation involved in immune response (GO:0002283) | 91/483 | 8.57600779873021e-63 | FCN1;SERPINA1;ITGAM;HEXB;RAB3D;CTSZ;SLC2A3;PYGL;TCIRG1;CTSS;SIRPB1;PYCARD;LGALS3;GLIPR1;FTH1;ANPEP;LAMP2;TIMP2;COTL1;ITGAX;CTSH;SIRPA;QSOX1;CD36;RAC1;CTSD;CD33;CTSB;ACTR2;SERPINB1;FCER1G;SYK;ANXA2;CD93;SLC11A1;PLAUR;CYBB;RNASE2;TNFRSF1B;RHOA;CKAP4;OSCAR;FGR;RAP2B;TYROBP;RAB31;DOK3;NPC2;PECAM1;ALDOA;S100A9;S100A8;TLR2;FTL;CFD;GRN;ASAH1;CLEC12A;BRI3;GSTP1;STXBP2;C5AR1;FGL2;FPR1;MGST1;IQGAP1;CFP;GNS;CST3;SDCBP;CREG1;PSAP;S100A12;NCKAP1L;CD14;LTA4H;S100A11;GCA;FUCA1;CMTM6;ADA2;LILRB2;LYZ;LILRB3;CPPED1;BST1;FCGR2A;SELL;MNDA;PTPN6;CD68 |
| DEG_up_cluster_5 | GO_Cellular_Component_2018 | tertiary granule (GO:0070820) | 35/164 | 8.50038554983416e-25 | ASAH1;ITGAM;CLEC12A;STXBP2;FPR1;SLC2A3;TCIRG1;CFP;CTSS;CST3;LGALS3;FTH1;LAMP2;TIMP2;ITGAX;SIRPA;CTSH;NCKAP1L;QSOX1;LTA4H;RAC1;CTSD;CD33;FCER1G;CD93;SLC11A1;CYBB;LILRB2;LYZ;RHOA;OSCAR;DOK3;RAP2B;PTPN6;ALDOA |
| DEG_up_cluster_6 | GO_Molecular_Function_2018 | MHC protein complex binding (GO:0023023) | 3/19 | 0.0190179456163385 | KLRC2;KLRD1;KLRC1 |
| DEG_up_cluster_6 | GO_Biological_Process_2018 | regulation of immune response (GO:0050776) | 12/251 | 1.01266899694941e-08 | FCGR3A;NCR3;TYROBP;KLRB1;ITGB2;KLRF1;KLRD1;KLRC1;KIR3DL2;CD247;HCST;KIR2DL3 |
| DEG_up_cluster_6 | GO_Cellular_Component_2018 | T cell receptor complex (GO:0042101) | 2/16 | 0.360090854988048 | ZAP70;CD247 |
| DEG_up_cluster_7 | GO_Molecular_Function_2018 | endopeptidase activity (GO:0004175) | 5/354 | 0.0626278450321713 | GZMM;GZMA;GZMB;CTSW;GZMH |
| DEG_up_cluster_7 | GO_Biological_Process_2018 | regulation of immune response (GO:0050776) | 8/251 | 1.73652368233746e-06 | CD8B;CD8A;TRAC;TRBC1;CD3G;KLRD1;CD3D;CD99 |
| DEG_up_cluster_7 | GO_Cellular_Component_2018 | T cell receptor complex (GO:0042101) | 4/16 | 1.28157914616563e-06 | CD8B;CD8A;CD3G;CD3D |
| DEG_up_cluster_8 | GO_Molecular_Function_2018 | RNA binding (GO:0003723) | 12/1387 | 0.00177473587265024 | RPL30;NPM1;RPL32;RPS8;RPS5;RPL22;RCAN3;RPL13;NOSIP;RPS3A;EIF3E;RPS13 |
| DEG_up_cluster_8 | GO_Biological_Process_2018 | nuclear-transcribed mRNA catabolic process, nonsense-mediated decay (GO:0000184) | 9/112 | 9.29283022220641e-10 | RPL30;RPL32;RPS8;RPS5;RPL22;RPL13;RPS3A;EIF3E;RPS13 |
| DEG_up_cluster_8 | GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | 8/124 | 1.22884997243549e-08 | RPL30;RPL32;RPS8;RPS5;RPL22;RPL13;RPS3A;RPS13 |
| DEG_up_cluster_9 | GO_Molecular_Function_2018 | C-C chemokine binding (GO:0019957) | 2/8 | 0.0898232418047339 | ZFP36;CXCR4 |
| DEG_up_cluster_9 | GO_Biological_Process_2018 | negative regulation of I-kappaB kinase/NF-kappaB signaling (GO:0043124) | 4/47 | 0.00601433425946692 | NFKBIA;PER1;TNFAIP3;RORA |
| DEG_up_cluster_9 | GO_Cellular_Component_2018 | transcription factor TFIIIC complex (GO:0000127) | 1/6 | 1 | GTF3C1 |
| DEG_up_cluster_10 | GO_Molecular_Function_2018 | protein kinase binding (GO:0019901) | 22/495 | 0.00106947791409088 | CSF1R;TCF7L2;WARS;MAP3K1;CTBP2;H2AFY;GSTP1;PLEK;RHOG;MSN;AP2A1;RHOC;ACTB;RHOA;RHOB;FGR;MAPKAPK3;CSK;PTPN6;PKN1;CALM2;CACUL1 |
| DEG_up_cluster_10 | GO_Biological_Process_2018 | neutrophil degranulation (GO:0043312) | 56/479 | 8.36993207998367e-33 | FCN1;SERPINA1;CTSZ;GDI2;TCIRG1;CTSS;PYCARD;HK3;FTH1;TIMP2;COTL1;ITGAX;CTSC;CTSB;CAP1;FCER1G;ANXA2;SLC11A1;RNASET2;RHOG;PLAUR;CYBB;TNFRSF1B;RHOA;FGR;TYROBP;RAB31;NPC2;CAT;PECAM1;ALDOA;FTL;CFD;CSTB;ASAH1;CLEC12A;BRI3;GSTP1;STXBP2;C5AR1;FGL2;CFP;GNS;CST3;PSAP;LTA4H;S100A11;CMTM6;LILRB2;ARPC5;LILRB3;CPPED1;TSPAN14;FCGR2A;PTPN6;CD68 |
| DEG_up_cluster_10 | GO_Cellular_Component_2018 | ficolin-1-rich granule (GO:0101002) | 26/184 | 1.10335422560136e-16 | FCN1;CFD;CSTB;ASAH1;SERPINA1;GSTP1;CTSZ;FGL2;TCIRG1;GNS;CTSS;CST3;HK3;FTH1;COTL1;TIMP2;ITGAX;LTA4H;CTSB;FCER1G;SLC11A1;ARPC5;LILRB2;RHOA;CAT;ALDOA |
| DEG_up_cluster_11 | GO_Molecular_Function_2018 | actin binding (GO:0003779) | 19/254 | 3.3062051584876e-05 | ITGB1;DBNL;DMTN;GSN;TPM4;WDR1;WIPF1;TPM3;ACTN1;ARPC1B;TPM1;PARVB;ARPC5;TMSB4X;COTL1;FLNA;MYH9;ALDOA;VCL |
| DEG_up_cluster_11 | GO_Biological_Process_2018 | platelet degranulation (GO:0002576) | 25/124 | 5.85076042390064e-17 | APP;CD63;SPARC;WDR1;ITGB3;ITGA2B;STXBP2;F13A1;CLU;THBS1;TMSB4X;FLNA;TIMP1;SRGN;TGFB1;ACTN1;PPBP;TUBA4A;CD9;TAGLN2;TLN1;ALDOA;VCL;FERMT3;PF4 |
| DEG_up_cluster_11 | GO_Cellular_Component_2018 | secretory granule lumen (GO:0034774) | 31/317 | 3.11697094129311e-13 | APP;SPARC;F13A1;CLU;THBS1;CYB5R3;TMSB4X;COTL1;TIMP1;OSTF1;CTSA;SRGN;SERPINB1;DBNL;GSN;TGFB1;TRAPPC1;ACTN1;PPBP;ARPC5;TUBB4B;PRDX6;ARHGAP45;PKM;BIN2;LCN2;ALDOA;VCL;FERMT3;PF4;FTL |
| DEG_up_cluster_12 | GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | 6/16 | 1.98163490767778e-06 | CD74;HLA-DMA;HLA-DMB;HLA-DRA;HLA-DOA;HLA-DRB1 |
| DEG_up_cluster_12 | GO_Biological_Process_2018 | antigen processing and presentation of exogenous peptide antigen via MHC class II (GO:0019886) | 15/97 | 7.37013560695357e-12 | CD74;HLA-DRB5;FCER1G;HLA-DMA;HLA-DMB;AP1S2;AP2S1;HLA-DPB1;HLA-DRA;HLA-DOA;HLA-DQA1;HLA-DRB1;LGMN;HLA-DPA1;HLA-DQB1 |
| DEG_up_cluster_12 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 11/14 | 8.96763056988054e-19 | CD74;HLA-DRB5;HLA-DMA;HLA-DMB;HLA-DPB1;HLA-DRA;HLA-DOA;HLA-DQA1;HLA-DRB1;HLA-DPA1;HLA-DQB1 |
| DEG_down_cluster_1 | GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | 3/16 | 0.00193713461963498 | CD74;HLA-DRA;HLA-DRB1 |
| DEG_down_cluster_1 | GO_Biological_Process_2018 | antigen processing and presentation of exogenous peptide antigen (GO:0002478) | 7/97 | 4.80272962533632e-07 | CD74;FCER1G;AP1S2;HLA-DRA;CTSS;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_1 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 4/14 | 1.81644797070467e-06 | CD74;HLA-DRA;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_2 | GO_Molecular_Function_2018 | MHC protein complex binding (GO:0023023) | 5/19 | 1.26946118172402e-06 | CD74;HLA-DMA;HLA-DRA;KLRD1;HLA-DRB1 |
| DEG_down_cluster_2 | GO_Biological_Process_2018 | antigen processing and presentation of exogenous peptide antigen (GO:0002478) | 8/97 | 7.35030942410114e-07 | CD74;HLA-DMA;HLA-DPB1;HLA-DRA;CTSD;CTSS;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_2 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 6/14 | 3.02060205795267e-10 | CD74;HLA-DMA;HLA-DPB1;HLA-DRA;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_3 | GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | 3/16 | 0.000737638737958742 | CD74;HLA-DRA;HLA-DRB1 |
| DEG_down_cluster_3 | GO_Biological_Process_2018 | antigen processing and presentation of exogenous peptide antigen (GO:0002478) | 6/97 | 3.97837837930265e-06 | CD74;AP1S2;HLA-DRA;CTSS;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_3 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 4/14 | 4.86393392185293e-07 | CD74;HLA-DRA;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_4 | GO_Molecular_Function_2018 | actin binding (GO:0003779) | 13/254 | 0.000300156121957636 | ITGB1;CAP1;ARPC1B;DSTN;IQGAP2;SYNE2;TMSB4X;MYH9;FLNA;LCP1;PFN1;ALDOA;MYO1F |
| DEG_down_cluster_4 | GO_Biological_Process_2018 | T cell activation (GO:0042110) | 15/88 | 4.03909158166256e-12 | MSN;CD3G;CD3E;SLA2;CD3D;CD2;ZAP70;PTPRC;CD8B;LCK;CD8A;CD7;FYN;LCP1;LAT |
| DEG_down_cluster_4 | GO_Cellular_Component_2018 | T cell receptor complex (GO:0042101) | 8/16 | 1.19179622306178e-10 | ZAP70;CD6;CD8B;CD8A;CD3G;CD247;CD3E;CD3D |
| DEG_down_cluster_5 | GO_Molecular_Function_2018 | RNA binding (GO:0003723) | 57/1387 | 7.59568149955502e-23 | RPL5;RPL30;RPL3;RPL32;RPL31;RPLP0;RPL10A;SYNE1;RPS15;RPS4X;RPS14;RPL7A;RPS16;RPS19;RPS18;RPL36;RPL38;RPL37;RPS10;RPS7;RPS8;RPS5;RPS6;CIRBP;RPL13A;RPSA;RPL37A;RPL29;DDX6;RPL10;RPL12;RPL11;RPS4Y1;ZFP36L2;RPS15A;RPS3;RPL14;RPL15;RPS2;RPS27A;RPL18;LYAR;RPL19;RBM38;HSPA8;RPL41;NPM1;RPL35A;RPL23A;RPS26;RPS25;RPS28;RPS27;CALR;FAU;RPS21;NOP53 |
| DEG_down_cluster_5 | GO_Biological_Process_2018 | SRP-dependent cotranslational protein targeting to membrane (GO:0006614) | 48/89 | 2.88303170655403e-75 | RPL5;RPL30;RPL3;RPL32;RPL10;RPL31;RPL12;RPLP0;RPL11;RPL10A;RPS4Y1;RPS4X;RPS15;RPL7A;RPS14;RPS16;RPS15A;RPS19;RPS18;RPL36;RPL14;RPS3;RPLP2;RPL38;RPL37;RPL15;RPS2;RPL18;RPS27A;RPS10;RPL19;RPL41;RPS7;RPS8;RPS5;RPS6;RPL13A;RPL35A;RPSA;RPL23A;RPS26;RPS25;RPS28;RPS27;RPS29;RPL37A;RPL29;RPS21 |
| DEG_down_cluster_5 | GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | 49/124 | 3.00367186609762e-69 | RPL5;RPL30;RPL3;RPL32;RPL31;RPLP0;RPL10A;RPS15;RPS4X;RPS14;RPL7A;RPS16;RPS19;RPL36AL;RPS18;RPL36;RPLP2;RPL38;RPL37;RPS10;RPS7;RPS8;RPS5;RPS6;RPL13A;RPSA;RPL37A;RPL29;RPL10;RPL12;RPL11;RPS4Y1;RPS15A;RPL14;RPS3;RPL15;RPS2;RPL18;RPS27A;RPL19;RPL41;RPL35A;RPL23A;RPS26;RPS25;RPS28;RPS27;RPS29;RPS21 |
| DEG_down_cluster_6 | GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | 3/16 | 0.00193713461963498 | CD74;HLA-DRA;HLA-DRB1 |
| DEG_down_cluster_6 | GO_Biological_Process_2018 | antigen processing and presentation of exogenous peptide antigen (GO:0002478) | 5/97 | 0.00159727585213224 | CD74;HLA-DRA;CTSS;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_6 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 4/14 | 1.81644797070467e-06 | CD74;HLA-DRA;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_7 | GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | 3/16 | 0.00236424694205082 | CD74;HLA-DRA;HLA-DRB1 |
| DEG_down_cluster_7 | GO_Biological_Process_2018 | cytokine-mediated signaling pathway (GO:0019221) | 9/633 | 0.00224498626907092 | IFITM3;NFKBIA;HLA-DRA;FOS;LTB;VIM;IL7R;JUNB;HLA-DRB1 |
| DEG_down_cluster_7 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 3/14 | 0.000596771775624343 | CD74;HLA-DRA;HLA-DRB1 |
| DEG_down_cluster_8 | GO_Molecular_Function_2018 | cadherin binding (GO:0045296) | 13/313 | 4.36984821605319e-05 | ITGB1;ANXA1;HSPA5;ANXA2;AHNAK;IQGAP1;EFHD2;FLNA;TAGLN2;PFN1;EZR;CLIC1;S100A11 |
| DEG_down_cluster_8 | GO_Biological_Process_2018 | platelet degranulation (GO:0002576) | 12/124 | 4.2361841852672e-08 | LYN;SRGN;CD63;TGFB1;APLP2;PSAP;PLEK;ANXA5;FLNA;TAGLN2;CTSW;TUBA4A |
| DEG_down_cluster_8 | GO_Cellular_Component_2018 | focal adhesion (GO:0005925) | 18/356 | 1.25359110771497e-09 | ITGB1;ANXA1;TPM4;HSPA5;AHNAK;CD81;ARPC1B;ANXA5;IQGAP1;ACTB;ARPC2;CAPN2;FLNA;CALR;LCP1;PFN1;EZR;CD99 |
| DEG_down_cluster_9 | GO_Molecular_Function_2018 | MHC class II protein complex binding (GO:0023026) | 4/16 | 2.28220256145266e-05 | CD74;HLA-DMA;HLA-DRA;HLA-DRB1 |
| DEG_down_cluster_9 | GO_Biological_Process_2018 | antigen processing and presentation of exogenous peptide antigen (GO:0002478) | 8/97 | 5.04886106824468e-08 | CD74;HLA-DMA;HLA-DPB1;HLA-DRA;CTSD;CTSS;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_9 | GO_Cellular_Component_2018 | MHC class II protein complex (GO:0042613) | 6/14 | 4.11555274787584e-11 | CD74;HLA-DMA;HLA-DPB1;HLA-DRA;HLA-DRB1;HLA-DPA1 |
| DEG_down_cluster_10 | GO_Molecular_Function_2018 | RNA binding (GO:0003723) | 42/1387 | 1.73489121755495e-13 | RPL5;RPL30;RPL3;RPL32;RPL10;RPL31;RPLP0;RPL10A;RPL9;ZFP36L2;SYNE1;RPS4X;RPS14;RPL7A;RPS15A;RPS18;RPS3;RPL14;RPL13;RPL37;RPS27A;LYAR;RPL19;RBM38;JUN;NPM1;RPS7;RPS8;RPS5;RPS6;RPL13A;RPL35A;RPSA;RPS3A;RPL23A;RPS26;RPS25;RPS28;RPS27;RPL37A;RPS20;RPS21 |
| DEG_down_cluster_10 | GO_Biological_Process_2018 | SRP-dependent cotranslational protein targeting to membrane (GO:0006614) | 39/89 | 2.83100252338431e-58 | RPL5;RPL30;RPL3;RPL32;RPL10;RPL31;RPLP1;RPLP0;RPL10A;RPL9;RPS4X;RPL7A;RPS14;RPS15A;RPS18;RPL14;RPS3;RPLP2;RPL13;RPL37;RPS27A;RPL19;RPS7;RPS8;RPS5;RPS6;RPL13A;RPL35A;RPS3A;RPSA;RPL23A;RPS26;RPS25;RPS28;RPS27;RPS29;RPL37A;RPS20;RPS21 |
| DEG_down_cluster_10 | GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | 39/124 | 2.05288819641447e-52 | RPL5;RPL30;RPL3;RPL32;RPL10;RPL31;RPLP1;RPLP0;RPL10A;RPL9;RPS4X;RPS14;RPL7A;RPS15A;RPS18;RPL14;RPS3;RPLP2;RPL13;RPL37;RPS27A;RPL19;RPS7;RPS8;RPS5;RPS6;RPL13A;RPL35A;RPSA;RPS3A;RPL23A;RPS26;RPS25;RPS28;RPS27;RPS29;RPL37A;RPS20;RPS21 |
| DEG_down_cluster_11 | GO_Molecular_Function_2018 | RNA binding (GO:0003723) | 242/1387 | 3.33622644554002e-104 | RPL4;EIF4A2;RPL5;EIF4A1;RPL30;RPL3;RPL32;RPL31;RPL34;HNRNPU;HNRNPR;RPL8;RPL9;RPL6;RPL7;RPS15;RPS14;PNN;RPS17;RPS16;LGALS1;RPL18A;RPS19;SNRPD2;RPS18;SNRPD1;RPL36;RPL35;ARL6IP4;RPL38;RPL37;RPS11;RPL39;RPS10;RPS13;DDX17;SRRM2;RPL21;SCAF11;RPL23;RPL22;CIRBP;SRRM1;FAM133B;DDX39A;SUB1;RPL24;RPL27;RPL26;RPL29;EZR;RPL28;SF1;BAZ2A;ZC3HAV1;RPS4Y1;SAMSN1;MRPL10;MTDH;DHX36;LYAR;BTF3;RPL41;JUN;PRPF38B;XRCC6;PRRC2C;PA2G4;HNRNPL;RPS26;BST2;HNRNPM;RPS25;RPS28;RPS27;RPL27A;HNRNPD;RPS20;HNRNPC;CALR;RPS21;RPS24;RPS23;SMG1;RPLP0;C1QBP;RACK1;SEC61B;ANXA2;PRPF40A;PPHLN1;NSA2;HNRNPH1;CSDE1;CANX;ARHGEF1;PABPC1;SF3B2;RPL10;AHNAK;RNMT;RPL12;RPL11;EXOSC6;BCLAF1;RPS15A;SRP72;TCOF1;RPL14;RPS3;RPL13;RPS2;RPL15;RPL18;RPS27A;SF3B1;SRSF10;SRSF11;RPL19;NPM1;RSRC2;KTN1;RSL1D1;RPL22L1;FAU;PEBP1;NUCKS1;RPL10A;BZW1;FBL;ZFP36;ZNF207;HMGN2;CAST;PNISR;RPS9;RPS7;RPS8;RPS5;RPS6;NOSIP;RPSA;PRPF4B;ILF2;TUFM;EEF1A1;ILF3;HNRNPUL1;NCL;SRSF2;SNRPG;SBDS;S100A4;ERH;SRSF5;SNRPF;PPIG;SNRNP200;PPIB;SRSF7;PPIA;KPNB1;KHDRBS1;DDX6;DDX5;CORO1A;ZFP36L2;HSP90B1;HNRNPDL;TPR;UBC;H1FX;HSPA9;HSPA8;FUS;NONO;TBCA;EEF2;LSM3;LSM8;SON;EIF2S3;CNOT1;UBE2N;RSL24D1;RBM25;HSP90AB1;DDX3X;CELF1;CELF2;ADAR;CHD2;SYNE1;RPS4X;RBM3;RPL7A;SUMO1;RBM5;HSP90AA1;RPL13A;RPS3A;SFPQ;NEMF;SYF2;EEF1D;THRAP3;RPL37A;LUC7L3;SNRPA1;DDX24;DDX21;U2AF1;SAMHD1;IFI16;PABPN1;TRA2A;EIF4H;SAP18;HNRNPA1;EIF4B;RBM39;HNRNPA3;NOP58;CNBP;CCDC59;RPL35A;RPL23A;HSPE1;EIF3L;EIF3G;EIF3H;EIF3E;VIM;SSBP1;NOP53;EIF3D;RBMX;TPT1;EIF3A;TNRC6A;RAN;TNRC6B |
| DEG_down_cluster_11 | GO_Biological_Process_2018 | protein targeting to ER (GO:0045047) | 81/97 | 3.85134564072111e-99 | RPL4;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;RPLP1;RPLP0;RPL8;RPL10A;RPL9;RPL6;RPL7;RPS15;RPS4X;RPS14;RPL7A;RPS17;RPS16;RPS19;RPL18A;RPS18;SEC61G;RPL36;RPL35;RPLP2;RPL38;RPL37;SEC62;RPS11;RPL39;RPS10;RPS13;RPS9;RPL21;RPS7;RPS8;RPL23;RPS5;RPL22;RPS6;RPL13A;RPSA;RPS3A;RPL37A;RPL24;RPL27;CHMP4A;RPL26;RPL29;RPL28;UBA52;RPL10;RPL12;RPL11;RPS4Y1;RPS15A;SRP72;RPL14;RPS3;RPL13;RPL15;RPS2;RPL18;RPS27A;RPL19;RPL41;RPL35A;RPL23A;RPS26;RPS25;RPS28;SPCS2;RPS27;RPS29;RPL27A;RPS20;RPS21;RPS24;RPS23 |
| DEG_down_cluster_11 | GO_Cellular_Component_2018 | cytosolic ribosome (GO:0022626) | 81/124 | 7.88180682609739e-85 | RPL4;RPL5;RPL30;RPL3;DDX3X;RPL32;RPL31;RPL34;RPLP1;RPLP0;RPL8;RPL10A;RPL9;RPL6;RPL7;RPS15;RPS4X;RPS14;RPL7A;RPS17;RPS16;RPS19;RPL18A;RPL36AL;RPS18;RPL36;RACK1;RPL35;RPLP2;RPL38;RPL37;RPS11;RPL39;RPS10;RPS13;RPS9;RPL21;RPS7;RPS8;RPL23;RPS5;RPL22;RPS6;RPL13A;RPSA;RPS3A;RPL37A;RPL24;RPL27;RPL26;RPL29;RPL28;RPL10;RPL12;RPL11;RPS4Y1;RPS15A;RPL14;RPS3;RPL13;RPL15;RPS2;RPL18;RPS27A;RPL19;RPL41;RPL35A;RPL23A;RPS26;RPS25;RSL1D1;RPS28;RPS27;RPS29;RPL27A;RPS20;RPL22L1;RPS21;RSL24D1;RPS24;RPS23 |
| DEG_down_cluster_12 | GO_Molecular_Function_2018 | T cell receptor binding (GO:0042608) | 4/6 | 5.4641345261933e-06 | LCK;CD3G;CD3E;HLA-E |
| DEG_down_cluster_12 | GO_Biological_Process_2018 | T cell activation (GO:0042110) | 11/88 | 5.98573039475656e-10 | CD2;ZAP70;CD8B;LCK;CD8A;CD7;RHOH;CD3G;FYN;CD3E;CD3D |
| DEG_down_cluster_12 | GO_Cellular_Component_2018 | T cell receptor complex (GO:0042101) | 8/16 | 4.66933721922445e-13 | ZAP70;CD6;CD8B;CD8A;CD3G;CD247;CD3E;CD3D |
We export the UMAP 2D visualisation, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser.
# Export UMAP coordinates
loupe_umap = as.data.frame(sc@reductions$umap@cell.embeddings)
loupe_umap = cbind(Barcode=rownames(loupe_umap), loupe_umap)
colnames(loupe_umap) = c("Barcode", "UMAP-1", "UMAP-2")
write.table(loupe_umap, file=paste0(param$path_out, "/Seurat2Loupe_umap.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
# Export categorical metadata
loupe_meta = as.data.frame(sc@meta.data)
idx_keep = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
loupe_meta = cbind(Barcode=rownames(loupe_meta), loupe_meta[, idx_keep])
write.table(x=loupe_meta, file=paste0(param$path_out, "/Seurat2Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
# Export gene sets
loupe_genesets = data.frame(List=paste0("DEG_up_cluster_", sc_markers_filt_up[,"cluster"]),
Name=sc_markers_filt_up[,"gene"],
Ensembl=seurat_rowname_to_ensembl[sc_markers_filt_up[,"gene"]])
loupe_genesets = rbind(loupe_genesets,
data.frame(List=paste0("DEG_down_cluster_", sc_markers_filt_down[,"cluster"]),
Name=sc_markers_filt_down[,"gene"],
Ensembl=seurat_rowname_to_ensembl[sc_markers_filt_down[,"gene"]]))
genesets_to_export = list(genes_cc_s_phase=genes_s[,2], genes_cc_g2m_phase=genes_g2m[,2])
for (i in names(genesets_to_export)) {
tmp_genes = genesets_to_export[[i]]
tmp_genes = tmp_genes[tmp_genes %in% names(symbol_to_ensembl)]
loupe_genesets = rbind(loupe_genesets,
data.frame(List=i,
Name=tmp_genes,
Ensembl=seurat_rowname_to_ensembl[tmp_genes]))
}
write.table(loupe_genesets, file=paste0(param$path_out, "/Seurat2Loupe_genesets.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")All files generated with this report are written into the provided output folder test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/:
This report was generated using the scrnaseq GitHub repository. Software versions were collected at run time.
out = scrnaseq_session_info(param$path_to_git)
knitr::kable(out, align="l") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))| Name | Version |
|---|---|
| ktrns/scrnaseq | 8d1d98883ccdf813e29e891007ab199db8d70f04 |
| R | R version 3.6.1 (2019-07-05) |
| Platform | x86_64-apple-darwin15.6.0 (64-bit) |
| Operating system | macOS Mojave 10.14.6 |
| Packages | abind1.4-5, AnnotationDbi1.46.1, ape5.3, assertthat0.2.1, backports1.1.5, bibtex0.4.2, Biobase2.44.0, BiocGenerics0.30.0, BiocParallel1.18.1, biomaRt2.40.5, bit1.1-14, bit640.9-7, bitops1.0-6, blme1.0-4, blob1.2.0, boot1.3-23, caTools1.17.1.3, cli2.0.2, cluster2.1.0, codetools0.2-16, colorspace1.4-1, cowplot1.0.0, crayon1.3.4, curl4.3, data.table1.12.6, DBI1.0.0, DelayedArray0.10.0, digest0.6.25, dplyr0.8.3, enrichR2.1, evaluate0.14, fansi0.4.0, farver2.0.1, fitdistrplus1.0-14, future1.15.1, future.apply1.3.0, gdata2.18.0, GenomeInfoDb1.20.0, GenomeInfoDbData1.2.1, GenomicRanges1.36.1, ggplot23.2.1, ggrepel0.8.1, ggridges0.5.1, ggsci2.9, globals0.12.5, glue1.4.1, gplots3.0.1.1, gridExtra2.3, gtable0.3.0, gtools3.8.1, highr0.8, hms0.5.2, htmltools0.4.0, htmlwidgets1.5.1, httr1.4.1, ica1.0-2, igraph1.2.4.2, IRanges2.18.3, irlba2.3.3, jsonlite1.6.1, kableExtra1.1.0, KernSmooth2.23-16, knitcitations1.0.10, knitr1.26, labeling0.3, lattice0.20-38, lazyeval0.2.2, leiden0.3.1, lifecycle0.1.0, listenv0.8.0, lme41.1-21, lmtest0.9-37, lsei1.2-0, lubridate1.7.4, magrittr1.5, MASS7.3-51.4, Matrix1.2-18, matrixStats0.55.0, memoise1.1.0, minqa1.2.4, munsell0.5.0, nlme3.1-142, nloptr1.2.1, npsurv0.4-0, openxlsx4.1.4, patchwork1.0.0, pbapply1.4-2, pillar1.4.2, pkgconfig2.0.3, plotly4.9.1, plyr1.8.4, png0.1-7, prettyunits1.0.2, progress1.2.2, purrr0.3.3, R62.4.1, RANN2.6.1, RColorBrewer1.1-2, Rcpp1.0.3, RcppAnnoy0.0.14, RcppParallel4.4.4, RCurl1.95-4.12, readr1.3.1, RefManageR1.2.12, reshape21.4.3, reticulate1.13, rjson0.2.20, rlang0.4.6, rmarkdown1.18, ROCR1.0-7, RSQLite2.1.4, rstudioapi0.11, rsvd1.0.2, Rtsne0.15, rvest0.3.5, S4Vectors0.22.1, scales1.1.0, sctransform0.2.0, sessioninfo1.1.1, Seurat3.1.5, SingleCellExperiment1.6.0, stringi1.4.3, stringr1.4.0, SummarizedExperiment1.14.1, survival3.1-8, tibble2.1.3, tidyr1.0.0, tidyselect0.2.5, tsne0.1-3, uwot0.1.5, vctrs0.2.0, viridisLite0.3.0, webshot0.5.2, withr2.1.2, xfun0.11, XML3.98-1.20, xml21.2.2, XVector0.24.0, yaml2.2.0, zeallot0.1.0, zip2.0.4, zlibbioc1.30.0, zoo1.8-6 |
Gandolfo, Luke C., and Terence P. Speed. 2018. “RLE Plots: Visualizing Unwanted Variation in High Dimensional Data.” Edited by Enrique Hernandez-Lemus. PLOS ONE 13 (2). Public Library of Science (PLoS): e0191629. https://doi.org/10.1371/journal.pone.0191629.
Hafemeister, Christoph, and Rahul Satija. 2019. “Normalization and Variance Stabilization of Single-Cell RNA-Seq Data Using Regularized Negative Binomial Regression.” Genome Biology 20 (1). Springer Science; Business Media LLC. https://doi.org/10.1186/s13059-019-1874-1.
Liu, Fenglin, Yuanyuan Zhang, Lei Zhang, Ziyi Li, Qiao Fang, Ranran Gao, and Zemin Zhang. 2019. “Systematic Comparative Analysis of Single-Nucleotide Variant Detection Methods from Single-Cell RNA Sequencing Data.” Genome Biology 20 (1). Springer Science; Business Media LLC. https://doi.org/10.1186/s13059-019-1863-4.
Tirosh, I., B. Izar, S. M. Prakadan, M. H. Wadsworth, D. Treacy, J. J. Trombetta, A. Rotem, et al. 2016. “Dissecting the Multicellular Ecosystem of Metastatic Melanoma by Single-Cell RNA-Seq.” Science 352 (6282). American Association for the Advancement of Science (AAAS): 189–96. https://doi.org/10.1126/science.aad0501.
Unknown. 2020a. “Unknown.” https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html. https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html.
———. 2020b. “Unknown.” https://pair-code.github.io/understanding-umap/. https://pair-code.github.io/understanding-umap/.
———. 2020c. “Unknown.” https://amp.pharm.mssm.edu/Enrichr/. https://amp.pharm.mssm.edu/Enrichr/.